diff --git a/.condaignore b/.condaignore new file mode 100644 index 0000000..763513e --- /dev/null +++ b/.condaignore @@ -0,0 +1 @@ +.ipynb_checkpoints diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ca2d606 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +test* +test_* +.idea +vinautil.egg-info +dist +data +build +*.pyc +__pycache__ +.ipynb_checkpoints/ +.ipynb_checkpoints diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7e82c85 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 lingyu zeng + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 5e57e87..a97b205 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,87 @@ -# vinautil +English | [中文](./README_cn.md) +## Introduction +All the code in this repository was developed by Zeng Lingyu during his pursuit of a graduate degree at the College of Biological Engineering and Food Science, Hubei University of Technology. Some of the code can be used to reproduce Zeng Lingyu's research work during his master's degree and is provided for reference. -## Getting started +These codes are mainly developed for [AutoDock Vina](https://vina.scripps.edu/) and [SCARdock](https://pubs.acs.org/doi/10.1021/acs.jcim.6b00334). -To make it easy for you to get started with GitLab, here's a list of recommended next steps. +[SCARdock website](https://scardock.com/) for screening covalent inhibitors. -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! +It can also handle PDB files and small molecule files. -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: +## Environment require +```yaml +- python =3.10 +- bioconda::mgltools +- conda-forge::spyrmsd +- conda-forge::pandas +- conda-forge::openbabel +- conda-forge::rdkit +- conda-forge::pymol-open-source 2.5.0 py310h292d129_6 +- conda-forge::loguru +- conda-forge::swig +- conda-forge::boost-cpp +- conda-forge::sphinx +- conda-forge::sphinx_rtd_theme +- conda-forge::vina 1.2.3 py310he924329_2 +- conda-forge::ipython +- conda-forge::biopython +- conda-forge::prody # meeko dependency (optionally, for covalent docking) ``` -cd existing_repo -git remote add origin http://jihugitlab.dockless.eu.org/lingyuzeng/vinautil.git -git branch -M main -git push -uf origin main -``` - -## Integrate with your tools - -- [ ] [Set up project integrations](http://jihugitlab.dockless.eu.org/lingyuzeng/vinautil/-/settings/integrations) - -## Collaborate with your team - -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) - -## Test and Deploy - -Use the built-in continuous integration in GitLab. - -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) - -*** - -# Editing this README - -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. - -## Suggestions for a good README - -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. - -## Name -Choose a self-explaining name for your project. - -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. - -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. - -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. - -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. ## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. +### Installation -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. +```shell +conda create -n vinautil_env -c pylyzeng vinautil --yes +conda activate vinautil_env +``` -## Contributing -State if you are open to contributions and what your requirements are for accepting them. +### Test -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. +```shell +# before test need execute: +scardock -h +scardocktest +``` -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. +### Docking -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. +```shell +scardock --help +usage: scardock [-h] [-r [recepotr file]] [-l [ligand file]] [-s [residue covalent site]] [-c [covalent chain ID]] + [-log [output log directory]] + +SCARdock Docking + +options: + -h, --help show this help message and exit + -r [recepotr file], --receptor [recepotr file] + recepotr file, support pdb + -l [ligand file], --ligand [ligand file] + ligand file, molecule file (MOL2, SDF,...)(use meeko prepare) + -s [residue covalent site], --site [residue covalent site] + residue covalent site + -c [covalent chain ID], --chain [covalent chain ID] + covalent chain ID + -log [output log directory], --log_dir [output log directory] + Relative Path +``` + +## Code Installation + +It is recommended to use conda for installation. + +## Acknowledgement + +[Sen Liu](https://sgsp.hbut.edu.cn/info/1085/1794.htm) + +[Qi Song](https://sgsp.hbut.edu.cn/info/1087/1813.htm) + +[Lab site](http://www.liugroup.site) ## License -For open source projects, say how it is licensed. - -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. +[MIT](./LICENSE) \ No newline at end of file diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000..34092e3 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,88 @@ +English(./README.md) | [中文] + +## 简介 +本仓库所有的代码,为曾令宇在湖北工业大学生物工程与食品学院攻读研究生学位时开发。部分代码可以用于复现曾令宇硕士期间的工作成果,代码供参考。 + +这些代码主要是为 [autodock vina](https://vina.scripps.edu/) 和 [SCARdock](https://pubs.acs.org/doi/10.1021/acs.jcim.6b00334) 开发。 + +[SCARdock 网站](https://scardock.com)筛选共价抑制剂。 + +同时也可以处理PDB文件以及小分子文件 + +## 环境 + +```yaml +- python =3.10 +- bioconda::mgltools +- conda-forge::spyrmsd +- conda-forge::pandas +- conda-forge::openbabel +- conda-forge::rdkit +- conda-forge::pymol-open-source 2.5.0 py310h292d129_6 +- conda-forge::loguru +- conda-forge::swig +- conda-forge::boost-cpp +- conda-forge::sphinx +- conda-forge::sphinx_rtd_theme +- conda-forge::vina 1.2.3 py310he924329_2 +- conda-forge::ipython +- conda-forge::biopython +- conda-forge::prody # meeko dependecy (optionally, for covalent docking) +``` + +## 使用 + +### 安装 + +```shell +conda create -n vinautil_env -c pylyzeng vinautil --yes +conda activate vinautil_env +``` + +### 测试 + +```shell +# 在测试前需要执行 +scardock -h +scardocktest +``` + +### 对接 + +```shell +scardock --help +usage: scardock [-h] [-r [recepotr file]] [-l [ligand file]] [-s [residue covalent site]] [-c [covalent chain ID]] + [-log [output log directory]] + +SCARdock Docking + +options: + -h, --help show this help message and exit + -r [recepotr file], --receptor [recepotr file] + recepotr file, support pdb + -l [ligand file], --ligand [ligand file] + ligand file, molecule file (MOL2, SDF,...)(use meeko prepare) + -s [residue covalent site], --site [residue covalent site] + residue covalent site + -c [covalent chain ID], --chain [covalent chain ID] + covalent chain ID + -log [output log directory], --log_dir [output log directory] + Relative Path +``` + +### 代码安装 + +推荐使用conda安装 + +### [构建](./conda_pack.md) + +## 致谢 + +[Sen Liu](https://sgsp.hbut.edu.cn/info/1085/1794.htm) + +[Qi Song](https://sgsp.hbut.edu.cn/info/1087/1813.htm) + +[Lab site](http://www.liugroup.site) + +## 许可协议 +[MIT](./LICENSE) \ No newline at end of file diff --git a/bld.bat b/bld.bat new file mode 100644 index 0000000..b9cd616 --- /dev/null +++ b/bld.bat @@ -0,0 +1,2 @@ +"%PYTHON%" setup.py install --single-version-externally-managed --record=record.txt +if errorlevel 1 exit 1 diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..06d208a --- /dev/null +++ b/build.sh @@ -0,0 +1,5 @@ +# install shell script for the project +$PYTHON -m pip install . --no-deps --ignore-installed -vv +git clone https://github.com/forlilab/Meeko +cd Meeko +$PYTHON -m pip install . --no-deps --ignore-installed -vv \ No newline at end of file diff --git a/conda_pack.md b/conda_pack.md new file mode 100644 index 0000000..c22d569 --- /dev/null +++ b/conda_pack.md @@ -0,0 +1,19 @@ +## install vinautil + +```shell +conda create -n my_env_test -c pylyzeng vinautil -y +``` + +## build anaconda package + +```shell +conda create -n condabuild python=3 -y +conda activate condabuild +conda install conda-build anaconda-client -y +cd +anaconda login +conda build . +anaconda upload /path/to/conda-package.tar.bz2 +``` + + diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..6196553 --- /dev/null +++ b/environment.yml @@ -0,0 +1,330 @@ +name: SCAR +channels: + - microsoft + - oddt + - conda-forge + - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main + - https://mirrors.aliyun.com/anaconda/cloud/ursky + - https://mirrors.aliyun.com/anaconda/cloud/stackless + - https://mirrors.aliyun.com/anaconda/cloud/simpleitk + - https://mirrors.aliyun.com/anaconda/cloud/rdkit + - https://mirrors.aliyun.com/anaconda/cloud/rapidsai + - https://mirrors.aliyun.com/anaconda/cloud/qiime2 + - https://mirrors.aliyun.com/anaconda/cloud/pyviz + - https://mirrors.aliyun.com/anaconda/cloud/pytorch3d + - https://mirrors.aliyun.com/anaconda/cloud/pytorch-test + - https://mirrors.aliyun.com/anaconda/cloud/pytorch + - https://mirrors.aliyun.com/anaconda/cloud/psi4 + - https://mirrors.aliyun.com/anaconda/cloud/plotly + - https://mirrors.aliyun.com/anaconda/cloud/omnia + - https://mirrors.aliyun.com/anaconda/cloud/ohmeta + - https://mirrors.aliyun.com/anaconda/cloud/numba + - https://mirrors.aliyun.com/anaconda/cloud/msys2 + - https://mirrors.aliyun.com/anaconda/cloud/mordred-descriptor + - https://mirrors.aliyun.com/anaconda/cloud/menpo + - https://mirrors.aliyun.com/anaconda/cloud/matsci + - https://mirrors.aliyun.com/anaconda/cloud/intel + - https://mirrors.aliyun.com/anaconda/cloud/idaholab + - https://mirrors.aliyun.com/anaconda/cloud/fermi + - https://mirrors.aliyun.com/anaconda/cloud/fastai + - https://mirrors.aliyun.com/anaconda/cloud/dglteam + - https://mirrors.aliyun.com/anaconda/cloud/deepmodeling + - https://mirrors.aliyun.com/anaconda/cloud/conda-forge + - https://mirrors.aliyun.com/anaconda/cloud/caffe2 + - https://mirrors.aliyun.com/anaconda/cloud/c4aarch64 + - https://mirrors.aliyun.com/anaconda/cloud/bioconda + - https://mirrors.aliyun.com/anaconda/cloud/biobakery + - https://mirrors.aliyun.com/anaconda/cloud/auto + - https://mirrors.aliyun.com/anaconda/cloud/Paddle + - https://mirrors.aliyun.com/anaconda/pkgs/r + - https://mirrors.aliyun.com/anaconda/pkgs/msys2 + - https://mirrors.aliyun.com/anaconda/pkgs/main + - https://mirrors.aliyun.com/anaconda/pkgs/free + - https://levinthal:paradox@conda.graylab.jhu.edu + - defaults +dependencies: + - alabaster=0.7.12=pyhd3eb1b0_0 + - argon2-cffi=21.3.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=21.2.0=py39hb82d6ee_2 + - arrow-cpp=8.0.0=py39hbd6f097_0 + - asttokens=2.0.5=pyhd8ed1ab_0 + - attrs=21.4.0=pyhd8ed1ab_0 + - aws-c-common=0.6.8=he774522_1 + - aws-c-event-stream=0.1.6=h84adeff_1 + - aws-checksums=0.1.11=h7f1f1e2_2 + - aws-sdk-cpp=1.8.185=h748b112_1 + - babel=2.9.1=pyhd3eb1b0_0 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 + - beautifulsoup4=4.11.1=pyha770c72_0 + - biopython=1.79=py39hb82d6ee_2 + - bleach=5.0.0=pyhd8ed1ab_0 + - blosc=1.21.1=hcbbf2c4_0 + - boost=1.74.0=py39hefe7e4c_5 + - boost-cpp=1.74.0=h9f4b32c_8 + - brotli=1.0.9=h8ffe710_7 + - brotli-bin=1.0.9=h8ffe710_7 + - brotli-python=1.0.9=py39h99910a6_8 + - brotlipy=0.7.0=py39h2bbff1b_1003 + - bzip2=1.0.8=h8ffe710_4 + - c-ares=1.18.1=h2bbff1b_0 + - ca-certificates=2022.9.24=h5b45459_0 + - cairo=1.16.0=h60892f0_1002 + - certifi=2022.9.24=pyhd8ed1ab_0 + - cffi=1.15.0=py39h0878f49_0 + - cfitsio=3.470=h2bbff1b_7 + - cftime=1.6.2=py39hc266a54_0 + - charls=2.2.0=h6c2663c_0 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.1.3=win_pyhd8ed1ab_2 + - cloudpickle=2.2.0=pyhd8ed1ab_0 + - colorama=0.4.4=pyh9f0ad1d_0 + - cryptography=37.0.1=py39h21b164f_0 + - curl=7.82.0=h2bbff1b_0 + - cycler=0.11.0=pyhd8ed1ab_0 + - cytoolz=0.12.0=py39hb82d6ee_0 + - dash=2.7.0=pyhd8ed1ab_0 + - dash-bootstrap-components=1.2.1=pyhd8ed1ab_0 + - dash-core-components=2.0.0=pyhd8ed1ab_1 + - dash-daq=0.5.0=pyh9f0ad1d_1 + - dash-html-components=2.0.0=pyhd8ed1ab_1 + - dash_colorscales=0.0.4=pyh9f0ad1d_0 + - dask-core=2022.9.2=pyhd8ed1ab_0 + - debugpy=1.6.0=py39h415ef7b_0 + - decorator=5.1.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - docutils=0.18.1=py39haa95532_3 + - dtale=2.9.0=pyhd8ed1ab_0 + - entrypoints=0.4=pyhd8ed1ab_0 + - et_xmlfile=1.0.1=py_1001 + - executing=0.8.3=pyhd8ed1ab_0 + - fasteners=0.17.3=pyhd8ed1ab_0 + - flask=2.2.2=pyhd8ed1ab_0 + - flask-compress=1.13=pyhd8ed1ab_0 + - flit-core=3.7.1=pyhd8ed1ab_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.10.4=h546665d_1 + - fsspec=2022.8.2=pyhd8ed1ab_0 + - future=0.18.2=pyhd8ed1ab_6 + - gflags=2.2.2=hf6abc00_0 + - giflib=5.2.1=h8d14728_2 + - glew=2.1.0=h39d44d4_2 + - glm=0.9.9.4=h5362a0b_2 + - glog=0.4.0=hdd1ffc6_1 + - greenlet=1.1.3=py39h415ef7b_0 + - griddataformats=1.0.1=pyhd8ed1ab_0 + - gsd=2.6.0=py39h5d4886f_0 + - h5py=2.10.0=nompi_py39hf27771d_106 + - hdf4=4.2.15=h0e5069d_3 + - hdf5=1.10.6=nompi_h5268f04_1114 + - icu=64.2=he025d50_1 + - idna=3.3=pyhd3eb1b0_0 + - imagecodecs=2021.8.26=py39hc0a7faf_1 + - imageio=2.22.0=pyhfa7a67d_0 + - imagesize=1.4.1=py39haa95532_0 + - importlib-metadata=4.11.3=py39hcbf5309_1 + - importlib_resources=5.7.1=pyhd8ed1ab_1 + - intel-openmp=2022.0.0=h57928b3_3663 + - ipykernel=6.13.0=py39h832f523_0 + - ipython=8.3.0=py39hcbf5309_0 + - ipython_genutils=0.2.0=py_1 + - ipywidgets=7.7.0=pyhd8ed1ab_0 + - itsdangerous=2.1.2=pyhd8ed1ab_0 + - jedi=0.18.1=py39hcbf5309_1 + - jinja2=3.1.2=pyhd8ed1ab_0 + - joblib=1.2.0=pyhd8ed1ab_0 + - jpeg=9e=h8ffe710_1 + - jsonschema=4.5.1=pyhd8ed1ab_0 + - jupyter=1.0.0=py39hcbf5309_7 + - jupyter_client=7.3.1=pyhd8ed1ab_0 + - jupyter_console=6.4.3=pyhd8ed1ab_0 + - jupyter_core=4.10.0=py39hcbf5309_0 + - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 + - jupyterlab_widgets=1.1.0=pyhd8ed1ab_0 + - kiwisolver=1.4.4=py39h2e07f2f_0 + - lcms2=2.12=h2a16943_0 + - lerc=3.0=h0e60522_0 + - libaec=1.0.6=h39d44d4_0 + - libblas=3.9.0=14_win64_mkl + - libbrotlicommon=1.0.9=h8ffe710_7 + - libbrotlidec=1.0.9=h8ffe710_7 + - libbrotlienc=1.0.9=h8ffe710_7 + - libcblas=3.9.0=14_win64_mkl + - libcurl=7.82.0=h86230a5_0 + - libdeflate=1.8=h2bbff1b_5 + - libffi=3.4.2=h8ffe710_5 + - libiconv=1.16=he774522_0 + - liblapack=3.9.0=14_win64_mkl + - libnetcdf=4.8.1=nompi_hf689e7d_100 + - libpng=1.6.37=h1d00b33_2 + - libprotobuf=3.20.1=h23ce68f_0 + - libsodium=1.0.18=h8d14728_1 + - libssh2=1.10.0=h680486a_2 + - libthrift=0.15.0=h636ae23_1 + - libtiff=4.4.0=h8a3f274_0 + - libwebp=1.2.4=h8ffe710_0 + - libwebp-base=1.2.4=h8ffe710_0 + - libxml2=2.9.14=hf5bbc77_0 + - libzip=1.9.2=hfed4ece_1 + - libzlib=1.2.13=hcfcfb64_4 + - libzopfli=1.0.3=h0e60522_0 + - locket=1.0.0=pyhd8ed1ab_0 + - lz4=4.0.2=py39hf617134_0 + - lz4-c=1.9.3=h8ffe710_1 + - m2w64-gcc-libgfortran=5.3.0=6 + - m2w64-gcc-libs=5.3.0=7 + - m2w64-gcc-libs-core=5.3.0=7 + - m2w64-gmp=6.1.0=2 + - m2w64-libwinpthread-git=5.0.0.4634.697f757=2 + - markupsafe=2.1.1=py39hb82d6ee_1 + - matplotlib-base=3.5.3=py39h581301d_0 + - matplotlib-inline=0.1.3=pyhd8ed1ab_0 + - mdanalysis=2.3.0=py39h2ba5b7c_0 + - missingno=0.4.2=py_1 + - mistune=0.8.4=py39hb82d6ee_1005 + - mkl=2022.0.0=h0e2418a_796 + - mmtf-python=1.1.3=pyhd8ed1ab_0 + - mrcfile=1.4.3=pyhd8ed1ab_0 + - msgpack-python=1.0.4=py39h2e07f2f_0 + - msys2-conda-epoch=20160418=1 + - munkres=1.1.4=pyh9f0ad1d_0 + - nbclient=0.6.3=pyhd8ed1ab_0 + - nbconvert=6.5.0=pyhd8ed1ab_0 + - nbconvert-core=6.5.0=pyhd8ed1ab_0 + - nbconvert-pandoc=6.5.0=pyhd8ed1ab_0 + - nbformat=5.4.0=pyhd8ed1ab_0 + - nest-asyncio=1.5.5=pyhd8ed1ab_0 + - netcdf4=1.5.7=py39h3de5c98_1 + - networkx=2.8.7=pyhd8ed1ab_0 + - notebook=6.4.11=pyha770c72_0 + - numpy=1.22.3=py39h0948cea_2 + - numpydoc=1.4.0=py39haa95532_0 + - oddt=0.7=py_0 + - openbabel=3.1.1=py39hfc62d72_3 + - openjpeg=2.5.0=hb211442_0 + - openpyxl=3.0.9=pyhd8ed1ab_0 + - openssl=1.1.1s=hcfcfb64_0 + - packaging=21.3=pyhd8ed1ab_0 + - pandas=1.4.3=py39h2e25243_0 + - pandoc=2.18=h57928b3_0 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - parso=0.8.3=pyhd8ed1ab_0 + - partd=1.3.0=pyhd8ed1ab_0 + - patsy=0.5.3=pyhd8ed1ab_0 + - pickleshare=0.7.5=py_1003 + - pillow=9.2.0=py39hdc2b20a_1 + - pip=22.1=pyhd8ed1ab_0 + - pixman=0.38.0=hfa6e2cd_1003 + - playwright=v1.27.1=py39_0 + - plotly=5.11.0=pyhd8ed1ab_0 + - pmw=2.0.1=py39hcbf5309_1006 + - prometheus_client=0.14.1=pyhd8ed1ab_0 + - prompt-toolkit=3.0.29=pyha770c72_0 + - prompt_toolkit=3.0.29=hd8ed1ab_0 + - psutil=5.9.0=py39hb82d6ee_1 + - pure_eval=0.2.2=pyhd8ed1ab_0 + - pyarrow=8.0.0=py39h953b917_0 + - pycairo=1.21.0=py39h1f09dad_1 + - pycparser=2.21=pyhd8ed1ab_0 + - pyee=8.1.0=pyhd8ed1ab_0 + - pygments=2.12.0=pyhd8ed1ab_0 + - pymol-open-source=2.5.0=py39h5616056_6 + - pyopenssl=22.0.0=pyhd3eb1b0_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pyqt=5.9.2=py39hd77b12b_6 + - pyreadline=2.1=py39hcbf5309_1006 + - pyrsistent=0.18.1=py39hb82d6ee_1 + - pysocks=1.7.1=py39haa95532_0 + - python=3.9.12=h9a09f29_1_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 + - python_abi=3.9=2_cp39 + - pytz=2022.1=pyhd8ed1ab_0 + - pywavelets=1.3.0=py39h5d4886f_1 + - pywin32=303=py39hb82d6ee_0 + - pywinpty=2.0.5=py39h99910a6_1 + - pyyaml=6.0=py39hb82d6ee_4 + - pyzmq=22.3.0=py39he46f08e_2 + - qt=5.9.7=h506e8af_3 + - qtconsole=5.3.0=pyhd8ed1ab_0 + - qtconsole-base=5.3.0=pyhd8ed1ab_0 + - qtpy=2.1.0=pyhd8ed1ab_0 + - rdkit=2022.03.5=py39hfadf033_0 + - re2=2022.04.01=h0e60522_0 + - reportlab=3.5.68=py39h12d40c5_1 + - requests=2.28.1=pyhd8ed1ab_0 + - scikit-image=0.19.3=py39h2e25243_1 + - scikit-learn=1.1.2=py39hfd4428b_0 + - scipy=1.9.1=py39h316f440_0 + - seaborn=0.12.0=hd8ed1ab_0 + - seaborn-base=0.12.0=pyhd8ed1ab_0 + - send2trash=1.8.0=pyhd8ed1ab_0 + - setuptools=62.2.0=py39hcbf5309_0 + - sip=4.19.13=py39hd77b12b_0 + - six=1.16.0=pyh6c4a22f_0 + - snappy=1.1.9=h82413e6_1 + - snowballstemmer=2.2.0=pyhd3eb1b0_0 + - soupsieve=2.3.1=pyhd8ed1ab_0 + - sphinx=5.0.2=py39haa95532_0 + - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0 + - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0 + - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0 + - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0 + - spyrmsd=0.5.2=pyhd8ed1ab_0 + - sqlalchemy=1.4.13=py39hb82d6ee_0 + - sqlite=3.38.5=h8ffe710_0 + - squarify=0.4.3=py_0 + - stack_data=0.2.0=pyhd8ed1ab_0 + - statsmodels=0.13.2=py39h5d4886f_0 + - strsimpy=0.1.9=pyh9f0ad1d_0 + - tbb=2021.5.0=h2d74725_1 + - tenacity=8.1.0=pyhd8ed1ab_0 + - terminado=0.13.3=py39hcbf5309_1 + - threadpoolctl=3.1.0=pyh8a188c0_0 + - tidynamics=1.0.0=py_0 + - tifffile=2021.4.8=pyhd8ed1ab_0 + - tinycss2=1.1.1=pyhd8ed1ab_0 + - tinydb=4.7.0=pyhd8ed1ab_0 + - tk=8.6.12=h8ffe710_0 + - toolz=0.12.0=pyhd8ed1ab_0 + - tornado=6.1=py39hb82d6ee_3 + - tqdm=4.64.1=pyhd8ed1ab_0 + - traitlets=5.2.1.post0=pyhd8ed1ab_0 + - typing_extensions=4.4.0=pyha770c72_0 + - tzdata=2022a=h191b570_0 + - ucrt=10.0.20348.0=h57928b3_0 + - urllib3=1.26.11=py39haa95532_0 + - utf8proc=2.6.1=h5343397_0 + - vc=14.2=hb210afc_6 + - vs2015_runtime=14.29.30139=h890b9b1_8 + - wcwidth=0.2.5=pyh9f0ad1d_2 + - webencodings=0.5.1=py_1 + - werkzeug=2.2.2=pyhd8ed1ab_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - widgetsnbextension=3.6.0=py39hcbf5309_0 + - win_inet_pton=1.1.0=py39haa95532_0 + - winpty=0.4.3=4 + - xarray=2022.11.0=pyhd8ed1ab_0 + - xlrd=2.0.1=pyhd8ed1ab_3 + - xz=5.2.6=h8d14728_0 + - yaml=0.2.5=h8ffe710_2 + - zeromq=4.3.4=h0e60522_1 + - zfp=0.5.5=h0e60522_8 + - zipp=3.8.0=pyhd8ed1ab_0 + - zlib=1.2.13=hcfcfb64_4 + - zstd=1.5.2=h7755175_4 + - pip: + - htmlmin==0.1.12 + - imagehash==4.3.1 + - meeko==0.3.3 + - multimethod==1.9 + - pandas-profiling==3.4.0 + - phik==0.12.2 + - pydantic==1.10.2 + - sklearn==0.0.post1 + - tangled-up-in-unicode==0.2.0 + - visions==0.7.5 +prefix: C:\ProgramData\Miniconda3\envs\SCAR diff --git a/meta.yaml b/meta.yaml new file mode 100644 index 0000000..f9fb39b --- /dev/null +++ b/meta.yaml @@ -0,0 +1,49 @@ +package: + name: vinautil + version: 0.0.7 + +source: + path: . + +build: + number: 4 + skip: True # [not linux64] +# script: python -m pip install . --no-deps --ignore-installed -vv + +requirements: + host: + - python =3.10 + - conda-forge::pyyaml + - pip + - setuptools + - git + run: + - python =3.10 + - conda-forge::pdbfixer + - conda-forge::openmm + - bioconda::mgltools + - conda-forge::spyrmsd + - conda-forge::pandas + - conda-forge::openbabel + - conda-forge::rdkit + - conda-forge::pymol-open-source 2.5.0 py310h292d129_6 + - conda-forge::loguru + - conda-forge::swig + - conda-forge::boost-cpp + - conda-forge::sphinx + - conda-forge::sphinx_rtd_theme + - conda-forge::vina 1.2.3 py310he924329_2 + - conda-forge::ipython + - conda-forge::biopython + - conda-forge::prody # meeko dependecy (optionally, for covalent docking) + +about: + home: https://github.com/hotwa/vinautil.git + license: MIT + summary: 'autodock vina tools' + description: | + for autodock vina transform pqbqt to mol2, restore chemical bond information. + make config file for autodock vina. add SCARdock + dev_url: https://github.com/hotwa/vinautil.git + doc_url: https://github.com/hotwa/vinautil.git + doc_source_url: https://github.com/hotwa/vinautil.git diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c6b2015 --- /dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +from setuptools import setup # auto find packages please import find_packages +from pathlib import Path +import yaml + +with open('meta.yaml', 'r') as file: + doc = yaml.load(file, Loader=yaml.FullLoader) + version = doc['package']['version'] + +here = Path(__file__).parent.resolve() +long_description = (here / "README.md").read_text(encoding="utf-8") + +setup( + name='vinautil', + version=version, + description=( + 'a tool for autodock vina' + ), + long_description=long_description, + long_description_content_type="text/markdown", + url='https://github.com/hotwa/vinautil.git', + author='lingyu zeng', + author_email='pylyzeng@gmail.com', + license='MIT License', + classifiers=[ + 'Operating System :: OS Independent', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 3', + "Programming Language :: Python :: 3 :: Only", + ], + keywords=["tools", "autodock vina"], + package_dir={"": "vinautil"}, # Optional + packages=['vinautil', 'vinautil.vutils', 'vinautil.pymolutils'], + python_requires='>=3.9', + install_requires=[], + dependency_links=[ + "https://github.com/openbabel/openbabel" + ], + platforms=["linux-64"], + entry_points={ + 'console_scripts': [ + 'scardock = vinautil.scardock:SCARdock', # Replace 'my_module' with the actual module that contains SCARdock function + 'scardocktest = vinautil.scardock:SCARdocktest' + ], + }, + package_data={ + 'vinautil': ['test/*'], + } +) diff --git a/vinautil/__init__.py b/vinautil/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vinautil/vinautil/Supporting_Information_Table_S1-3.xlsx b/vinautil/vinautil/Supporting_Information_Table_S1-3.xlsx new file mode 100644 index 0000000..94a2c76 Binary files /dev/null and b/vinautil/vinautil/Supporting_Information_Table_S1-3.xlsx differ diff --git a/vinautil/vinautil/__init__.py b/vinautil/vinautil/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vinautil/vinautil/config.py b/vinautil/vinautil/config.py new file mode 100644 index 0000000..6594e05 --- /dev/null +++ b/vinautil/vinautil/config.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :configutils.py +@Description: : generate config file for autodock vina +@Date :2022/12/28 21:58:08 +@Author :hotwa +@version :1.1 +''' +import sys +from pathlib import Path +here = Path(__file__).parent.resolve() +from dataclasses import dataclass, field +from typing import Iterable +from openbabel import pybel +try: + from .typecheck import typeassert +except ImportError: + from vinautil.utils.typecheck import typeassert + +@typeassert(file=Path, fmt=str) +class molecule: + file: Path + + def __init__(self,file: Path, fmt:str=None): + self.file = file + self.fmt = fmt if bool(fmt) else file.suffix.replace('.','') + + def get_coord_lst(self, addHs:bool = True): + molH = pybel.readfile(format = self.fmt, filename=self.file.__str__()) + molH = next(molH) + # molH.OBMol.DeleteHydrogens() + if addHs: molH.OBMol.AddPolarHydrogens() + return [list(atom.coords) for atom in molH] + + def get_coord_dict(self, addHs:bool = True): + molH = pybel.readfile(format = self.fmt, filename=self.file.__str__()) + molH = next(molH) + if addHs: molH.OBMol.AddPolarHydrogens() + return {atom.idx: atom.coords for atom in molH} + + def compute_box(self, extending = 10): + coords_lst = self.get_coord_lst() + xlst, ylst, zlst = [i[0] for i in coords_lst], [i[1] for i in coords_lst], [i[2] for i in coords_lst] + ([minX, minY, minZ], [maxX, maxY, maxZ]) = ([min(xlst), min(ylst), min(zlst)], [max(xlst), max(ylst), max(zlst)]) + minX = minX - float(extending) + minY = minY - float(extending) + minZ = minZ - float(extending) + maxX = maxX + float(extending) + maxY = maxY + float(extending) + maxZ = maxZ + float(extending) + SizeX = maxX - minX + SizeY = maxY - minY + SizeZ = maxZ - minZ + CenterX = (maxX + minX) / 2 + CenterY = (maxY + minY) / 2 + CenterZ = (maxZ + minZ) / 2 + c = { + 'x': float("{:.2f}".format(CenterX)), + 'y': float("{:.2f}".format(CenterY)), + 'z': float("{:.2f}".format(CenterZ)), + } + s = { + 'x': float("{:.2f}".format(SizeX)), + 'y': float("{:.2f}".format(SizeY)), + 'z': float("{:.2f}".format(SizeZ)) + } + rd = { + 'center': xyz_point(**c), + 'box_size': xyz_point(**s) + } + return rd + +@typeassert(x=(float, int), y=(float, int), z=(float, int)) +@dataclass +class xyz_point: + x: (float, int) + y: (float, int) + z: (float, int) + + def __repr__(self): + return f"xyz_point(x={self.x}, y={self.y}, z={self.z})" + + def __str__(self): + return f"three demensional point: x={self.x}, y={self.y}, z={self.z}" + + @classmethod + def from_iterable(cls,iterable_object:Iterable): + if isinstance(iterable_object,Iterable): + return cls(*iterable_object) + else: + raise TypeError('iterable_object must be Iterable') + +@typeassert(receptor=Path, + ligand=Path, + box_size=(xyz_point, type(None)), + center=(xyz_point, type(None)), + outpdbqtfile=Path, + exhaustiveness=int, + num_modes=int, + energy_range=int) +@dataclass +class vinaconfig: + receptor: Path + ligand: Path + outpdbqtfile: Path + center: (xyz_point, type(None)) = field(default_factory=lambda: None) + box_size: (xyz_point, type(None)) = field(default_factory=lambda: None) + exhaustiveness: int = field(default_factory=lambda: 32) + num_modes: int = field(default_factory=lambda: 20) + energy_range: int = field(default_factory=lambda: 5) + extend: int = field(default_factory=lambda: 10) + + def __str__(self): + return f"""the class of autodock vina config file:\n\nreceptor: {self.receptor}\nligand: {self.ligand}\ncenter: {self.center}\nbox_size: {self.box_size}\noutpdbqtfile: {self.outpdbqtfile}\nexhaustiveness: {self.exhaustiveness}\nnum_modes: {self.num_modes}\nenergy_range: {self.energy_range}\nextend: {self.extend}\n""" + + def __post_init__(self): + if (self.center == None) and (self.box_size == None): + m = molecule(self.receptor) + box = m.compute_box(extending = self.extend) + self.center = box['center'] + self.box_size = box['box_size'] + + def to_dict(self)->dict: + return {'receptor': self.receptor, + 'ligand': self.ligand, + 'center': self.center, + 'box_size': self.box_size, + 'outpdbqtfile': self.outpdbqtfile, + 'exhaustiveness': self.exhaustiveness, + 'num_modes': self.num_modes, + 'energy_range': self.energy_range} + + def to_txt(self, file:Path, + cx = None, + cy = None, + cz = None, + sx = None, + sy = None, + sz = None, + exhaustiveness = None, + num_modes = None, + energy_range = None, + absolute_path=False) -> None: + """to_txt save config to file supported autodock vina 1.1.2 and 1.2.3 binary file + + use: vina --config config.txt + + Arguments: + file {file path} -- out put file + """ + if absolute_path: + receptor_path = self.receptor.absolute().as_posix() + ligand_path = self.ligand.absolute().as_posix() + outpdbqtfile_path = self.outpdbqtfile.absolute().as_posix() + else: + receptor_path = self.receptor.as_posix() + ligand_path = self.ligand.as_posix() + outpdbqtfile_path = self.outpdbqtfile.as_posix() + exhaustiveness = exhaustiveness if exhaustiveness else self.exhaustiveness + num_modes = num_modes if num_modes else self.num_modes + energy_range = energy_range if energy_range else self.energy_range + cx = cx if cx else self.center.x + cy = cy if cy else self.center.y + cz = cz if cz else self.center.z + sx = sx if sx else self.box_size.x + sy = sy if sy else self.box_size.y + sz = sz if sz else self.box_size.z + content = f''' +receptor = {receptor_path} +ligand = {ligand_path} + +center_x = {cx} +center_y = {cy} +center_z = {cz} + +size_x = {sx} +size_y = {sy} +size_z = {sz} + + +exhaustiveness = {exhaustiveness} + +num_modes = {num_modes} + +energy_range = {energy_range} + +out = {outpdbqtfile_path} +''' + file,file_dir = Path(file), Path(file).parent + if not file_dir.exists(): file_dir.mkdir(parents=True) + with open(file.absolute().as_posix(), 'w', encoding='utf-8') as f: + f.write(content) \ No newline at end of file diff --git a/vinautil/vinautil/getbox_pymol.py b/vinautil/vinautil/getbox_pymol.py new file mode 100644 index 0000000..d68cfa0 --- /dev/null +++ b/vinautil/vinautil/getbox_pymol.py @@ -0,0 +1,280 @@ +# -*- coding: utf-8 -*- +from pymol.cgo import * +from pymol import cmd +from random import randint +from pymol.vfont import plain +import sys + + +############################################################################## +# GetBox Plugin.py -- Draws a box surrounding a selection and gets box information +# This script is used to get box information for LeDock, Autodock Vina and AutoDock Vina. +# Copyright (C) 2014 by Mengwu Xiao (Hunan University) +# +# USAGES: See function GetBoxHelp() +# REFERENCE: drawBoundingBox.py written by Jason Vertrees +# EMAIL: mwxiao AT hnu DOT edu DOT cn +# Changes: +# 2014-07-30 first version was uploaded to BioMS http://bioms.org/forum.php?mod=viewthread&tid=1234 +# 2018-02-04 uploaded to GitHub https://github.com/MengwuXiao/Getbox-PyMOL-Plugin +# fixed some bugs: python 2/3 and PyMOL 1.x are supported; +# added support to AutoDock; +# added tutorials in English; +# 2023-03-05 uploaded to GitHub https://github.com/hotwa/vinautil +# add function getvinabox and from_complex_get_box update by lingyuzeng(mail: pylyzeng@gmail.com) +# NOTES: +# This program is free software; you can redistribute it and/or modify it under the terms of the GNU +# General Public License as published by the Free Software Foundation version 3 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See +# the GNU General Public License for more details. +############################################################################## + + +def __init__(self): + self.menuBar.addcascademenu('Plugin', 'GetBox Plugin', 'GetBox PyMOL Plugin', label='GetBox Plugin') + self.menuBar.addmenuitem('GetBox Plugin', 'command', 'GetBoxHelp', label='Advanced usage', + command=lambda s=self: GetBoxHelp()) + self.menuBar.addmenuitem('GetBox Plugin', 'command', 'AutoBox', label='Autodetect box', + command=lambda s=self: autobox()) + self.menuBar.addmenuitem('GetBox Plugin', 'command', 'GetBox', label='Get box from selection (sele) ', + command=lambda s=self: getbox()) + self.menuBar.addmenuitem('GetBox Plugin', 'command', 'Remove HETATM', label='Remove HETATM ', + command=lambda s=self: rmhet()) + + +# to deal with print +def printf(str): + if sys.version < '3': + exec("print str") + else: + exec("print(str)") + + +def GetBoxHelp(): + Usages = '''get latest plugin and tutorials at https://github.com/MengwuXiao/Getbox-PyMOL-Plugin + +Usages: +this plugin is a simple tool to get box information for LeDock and Autodock Vina or other molecular docking soft. Using the following functions to get box is recommended. + +* autobox [extending] (NOTES: solvent & some anions will be removed) + this function autodetects box in chain A with one click of mouse, but sometimes it fails for too many ligands or no ligand + e.g. autobox + +* getbox [selection = (sele), [extending = 5.0]] + this function creates a box that around the selected objects (residues or ligands or HOH or others). Selecting ligands or residues in the active cavity reported in papers is recommended + e.g. getbox + e.g. getbox (sele), 6.0 + +* resibox [Residues String, [extending = 5.0]] + this function creates a box that arroud the input residues in chain A. Selecting residues in the active cavity reported in papers is recommended\n\ + e.g. resibox resi 214+226+245, 8.0 + e.g. resibox resi 234 + resn HEM, 6.0 + +* showbox [minX, maxX, minY, maxY, minZ, maxZ] + this function creates a box based on the input axis, used to visualize box or amend box coordinate + e.g. showbox 2,3,4,5,6,7 + + * rmhet + remove HETATM, remove all HETATM in the screen + +Notes: +* If you have any questions or advice, please do not hesitate to contact me (mwxiao AT hnu DOT edu DOT cn), thank you!''' + + printf(Usages) + return + + +def showaxes(minX, minY, minZ): + cmd.delete('axes') + w = 0.5 # cylinder width + l = 5.0 # cylinder length + obj = [ + CYLINDER, minX, minY, minZ, minX + l, minY, minZ, w, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, + CYLINDER, minX, minY, minZ, minX, minY + l, minZ, w, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + CYLINDER, minX, minY, minZ, minX, minY, minZ + l, w, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, + ] + cyl_text(obj, plain, [minX + l, minY, minZ - w], 'X', 0.20, axes=[[3, 0, 0], [0, 3, 0], [0, 0, 3]]) + cyl_text(obj, plain, [minX - w, minY + l, minZ], 'Y', 0.20, axes=[[3, 0, 0], [0, 3, 0], [0, 0, 3]]) + cyl_text(obj, plain, [minX - w, minY, minZ + l], 'Z', 0.20, axes=[[3, 0, 0], [0, 3, 0], [0, 0, 3]]) + cmd.load_cgo(obj, 'axes') + return + + +def showbox(minX, maxX, minY, maxY, minZ, maxZ): + linewidth = 3.0 + minX = float(minX) + minY = float(minY) + minZ = float(minZ) + maxX = float(maxX) + maxY = float(maxY) + maxZ = float(maxZ) + showaxes(minX, minY, minZ) + boundingBox = [ + LINEWIDTH, float(linewidth), + BEGIN, LINES, + # x lines + COLOR, 1.0, 0.0, 0.0, # red + VERTEX, minX, minY, minZ, # 1 + VERTEX, maxX, minY, minZ, # 5 + + VERTEX, minX, maxY, minZ, # 3 + VERTEX, maxX, maxY, minZ, # 7 + + VERTEX, minX, maxY, maxZ, # 4 + VERTEX, maxX, maxY, maxZ, # 8 + + VERTEX, minX, minY, maxZ, # 2 + VERTEX, maxX, minY, maxZ, # 6 + # y lines + COLOR, 0.0, 1.0, 0.0, # green + VERTEX, minX, minY, minZ, # 1 + VERTEX, minX, maxY, minZ, # 3 + + VERTEX, maxX, minY, minZ, # 5 + VERTEX, maxX, maxY, minZ, # 7 + + VERTEX, minX, minY, maxZ, # 2 + VERTEX, minX, maxY, maxZ, # 4 + + VERTEX, maxX, minY, maxZ, # 6 + VERTEX, maxX, maxY, maxZ, # 8 + # z lines + COLOR, 0.0, 0.0, 1.0, # blue + VERTEX, minX, minY, minZ, # 1 + VERTEX, minX, minY, maxZ, # 2 + + VERTEX, minX, maxY, minZ, # 3 + VERTEX, minX, maxY, maxZ, # 4 + + VERTEX, maxX, minY, minZ, # 5 + VERTEX, maxX, minY, maxZ, # 6 + + VERTEX, maxX, maxY, minZ, # 7 + VERTEX, maxX, maxY, maxZ, # 8 + + END + ] + boxName = "box_" + str(randint(0, 10000)) + while boxName in cmd.get_names(): + boxName = "box_" + str(randint(0, 10000)) + cmd.load_cgo(boundingBox, boxName) + SizeX = maxX - minX + SizeY = maxY - minY + SizeZ = maxZ - minZ + CenterX = (maxX + minX) / 2 + CenterY = (maxY + minY) / 2 + CenterZ = (maxZ + minZ) / 2 + BoxCode = "BoxCode(" + boxName + ") = showbox %0.1f, %0.1f, %0.1f, %0.1f, %0.1f, %0.1f" % ( + minX, maxX, minY, maxY, minZ, maxZ) + # output LeDock input file + LeDockBox = "*********LeDock Binding Pocket*********\n" + \ + "Binding pocket\n%.1f %.1f\n%.1f %.1f\n%.1f %.1f\n" % (minX, maxX, minY, maxY, minZ, maxZ) + # output AutoDock Vina input file + VinaBox = "*********AutoDock Vina Binding Pocket*********\n" + \ + "--center_x %.1f --center_y %.1f --center_z %.1f --size_x %.1f --size_y %.1f --size_z %.1f\n" % ( + CenterX, CenterY, CenterZ, SizeX, SizeY, SizeZ) + # output AutoDock box information + # add this function in 2016-6-25 by mwxiao + AutoDockBox = "*********AutoDock Grid Option*********\n" + \ + "npts %d %d %d # num. grid points in xyz\n" % (SizeX / 0.375, SizeY / 0.375, SizeZ / 0.375) + \ + "spacing 0.375 # spacing (A)\n" + \ + "gridcenter %.3f %.3f %.3f # xyz-coordinates or auto\n" % (CenterX, CenterY, CenterZ) + + printf(VinaBox) + printf(AutoDockBox) + printf(LeDockBox) + printf(BoxCode) + cmd.zoom(boxName) + # cmd.show('surface') + return boxName + + +def getbox(selection="(sele)", extending=5.0): + cmd.hide("spheres") + cmd.show("spheres", selection) + ([minX, minY, minZ], [maxX, maxY, maxZ]) = cmd.get_extent(selection) + minX = minX - float(extending) + minY = minY - float(extending) + minZ = minZ - float(extending) + maxX = maxX + float(extending) + maxY = maxY + float(extending) + maxZ = maxZ + float(extending) + cmd.zoom(showbox(minX, maxX, minY, maxY, minZ, maxZ)) + return + + +# remove ions +def removeions(): + cmd.select("Ions", "((resn PO4) | (resn SO4) | (resn ZN) | (resn CA) | (resn MG) | (resn CL)) & hetatm") + cmd.remove("Ions") + cmd.delete("Ions") + return + + +# autodedect box +def autobox(extending=5.0): + cmd.remove('solvent') + removeions() + cmd.select("ChainAHet", "hetatm & chain A") # found error in pymol 1.8 change "chain a" to "chain A" + getbox("ChainAHet", extending) + return + + +# remove hetatm +def rmhet(extending=5.0): + cmd.select("rmhet", "hetatm") + cmd.remove("rmhet") + return + + +# getbox from cavity residues that reported in papers +def resibox(ResiduesStr="", extending=5.0): + cmd.select("Residues", ResiduesStr + " & chain A") + getbox("Residues", extending) + return + + +def getvinabox(selection="(sele)", extending=5.0) -> dict: + # 获取 sele 对象的autodock vina盒子大小 + cmd.hide("spheres") + cmd.show("spheres", selection) + ([minX, minY, minZ], [maxX, maxY, maxZ]) = cmd.get_extent(selection) + minX = minX - float(extending) + minY = minY - float(extending) + minZ = minZ - float(extending) + maxX = maxX + float(extending) + maxY = maxY + float(extending) + maxZ = maxZ + float(extending) + SizeX = maxX - minX + SizeY = maxY - minY + SizeZ = maxZ - minZ + CenterX = (maxX + minX) / 2 + CenterY = (maxY + minY) / 2 + CenterZ = (maxZ + minZ) / 2 + return {'cx': "{:.2f}".format(CenterX), + 'cy': '{:.2f}'.format(CenterY), + 'cz': "{:.2f}".format(CenterZ), + 'sx': "{:.2f}".format(SizeX), + 'sy': "{:.2f}".format(SizeY), + 'sz': "{:.2f}".format(SizeZ)} + + +def from_complex_get_box(ligandfile: str, chain: str, ligand_name: str, extending: int = 5) -> dict: + # 从复合物中获取小分子的盒子大小 Get molecule box size from complex + cmd.reinitialize('everything') # ! clean up + cmd.load(ligandfile) # load file + cmd.select('obj_select', f'( resn {ligand_name} ) and chain {chain}') # select ligand as object named sele + box = getvinabox(selection="(obj_select)", extending=extending) + for k, v in box.items(): + box[k] = float(v) + return box + + +cmd.extend("getbox", getbox) +cmd.extend("showbox", showbox) +cmd.extend("autobox", autobox) +cmd.extend("resibox", resibox) +cmd.extend("GetBoxHelp", GetBoxHelp) +cmd.extend("rmhet", rmhet) diff --git a/vinautil/vinautil/main.py b/vinautil/vinautil/main.py new file mode 100644 index 0000000..c6c63b4 --- /dev/null +++ b/vinautil/vinautil/main.py @@ -0,0 +1,3 @@ +import pathlib + +here = pathlib.Path(__file__).parent.resolve() \ No newline at end of file diff --git a/vinautil/vinautil/molecule.py b/vinautil/vinautil/molecule.py new file mode 100644 index 0000000..04ae87c --- /dev/null +++ b/vinautil/vinautil/molecule.py @@ -0,0 +1,86 @@ +from pathlib import Path, PurePath +from openbabel import pybel +from vinautil.vinaconfig import xyz_point + +from vinautil.utils.typecheck import typeassert + + +class center(xyz_point): + def __init__(self, x, y, z): + super().__init__(x, y, z) + + def __repr__(self): + return 'center: {},{},{}'.format(self.x, self.y, self.z) + +class box_size(xyz_point): + def __init__(self, x, y, z): + super().__init__(x, y, z) + + def __repr__(self): + return 'box_size: {},{},{}'.format(self.x, self.y, self.z) + + +@typeassert(file=Path, fmt=str) +class molecule: + + def __init__(self, file: Path, fmt:str=None): + self.file = file + self.fmt = fmt if bool(fmt) else file.suffix.replace('.','') + # self.center, self.box_size = self.compute_box().values() + + def compute_box(self, extending = 10): + coords_lst = self.get_coord_lst() + xlst, ylst, zlst = [i[0] for i in coords_lst], [i[1] for i in coords_lst], [i[2] for i in coords_lst] + ([minX, minY, minZ], [maxX, maxY, maxZ]) = ([min(xlst), min(ylst), min(zlst)], [max(xlst), max(ylst), max(zlst)]) + minX = minX - float(extending) + minY = minY - float(extending) + minZ = minZ - float(extending) + maxX = maxX + float(extending) + maxY = maxY + float(extending) + maxZ = maxZ + float(extending) + SizeX = maxX - minX + SizeY = maxY - minY + SizeZ = maxZ - minZ + CenterX = (maxX + minX) / 2 + CenterY = (maxY + minY) / 2 + CenterZ = (maxZ + minZ) / 2 + c = { + 'x': float("{:.2f}".format(CenterX)), + 'y': float("{:.2f}".format(CenterY)), + 'z': float("{:.2f}".format(CenterZ)), + } + s = { + 'x': float("{:.2f}".format(SizeX)), + 'y': float("{:.2f}".format(SizeY)), + 'z': float("{:.2f}".format(SizeZ)) + } + rd = { + 'center': center(**c), + 'box_size': box_size(**s) + } + return rd + + @property + def center(self): + return self.center + + @property + def box_size(self): + return self.box_size + + def get_coord_lst(self, addHs:bool = True): + molH = pybel.readfile(format = self.fmt, filename=self.file.__str__()) + molH = next(molH) + # molH.OBMol.DeleteHydrogens() + if addHs: molH.OBMol.AddPolarHydrogens() + return [list(atom.coords) for atom in molH] + + def get_coord_dict(self, addHs:bool = True): + molH = pybel.readfile(format = self.fmt, filename=self.file.__str__()) + molH = next(molH) + if addHs: molH.OBMol.AddPolarHydrogens() + return {atom.idx: atom.coords for atom in molH} + +if __name__ == '__main__': + file = Path('./test_file/2xd1_C_CEF.mol2') + mol = molecule(file) \ No newline at end of file diff --git a/vinautil/vinautil/parserPDBQT.py b/vinautil/vinautil/parserPDBQT.py new file mode 100644 index 0000000..a98b877 --- /dev/null +++ b/vinautil/vinautil/parserPDBQT.py @@ -0,0 +1,136 @@ +#!/usr/bin/ehon +# -*- encoding: utf-8 -*- +''' +@file :parserPDBQT.py +@Description: : parser PDBQT file from autodock vina +@Date :2022/10/07 17:08:03 +@Author :hotwa +@version :1.0 + +add 2022 10 07 SumNoHAtom +''' +# from typing import TypeVar, Text +import re +from pathlib import Path +# from io import StringIO,BytesIO +from pdbparse import Bdp + +class qtparser(object): + __slots__ = ["file", "path", "file_stem", "file_obj", "lines_num"] + + def __init__(self,file): + self.file = file + self.path = Path(file).parent + self.file_stem = Path(file).stem + self.file_obj = Path(file) + self.init() + + def init(self): + with open(self.file, 'r') as f: + self.lines_num = len(f.readlines()) + + @property + def module_number(self): + return len(self.getmodules()) + + def getmodules(self): + module_lst = [] + module = [] + f = open(self.file, 'r') + for i in range(self.lines_num): + line = f.readline() + if line.strip() != 'ENDMDL': + module.append(line) + else: + module.append(line) + module_lst.append(module) + module = [] + return module_lst + + def extract_pose(self, sequence:int, out_file:Path=None, overwrite:bool=True): + """ + extract pdbqt pose + :param file: output file + :param sequence: extract pose number + :return: + """ + lst = self.getmodules() + _string = ''.join(lst[sequence]) + if out_file: + _p = Path(out_file) + if not _p.parent.exists(): _p.parent.mkdir(parents=True) + with open(out_file, 'w', encoding='utf-8') as f: + f.write(_string) + return out_file + else: + return _string + + @classmethod + def write_pose(cls, file): + self = cls(file) + lst = self.getmodules() + for i in range(len(lst)): + _string = ''.join(lst[i]) + _p = self.path.joinpath(f'{self.file_stem}') + if not _p.exists(): _p.mkdir(parents = True) + with open(_p.joinpath(self.file_stem + f'_{i}.pdbqt').__str__(), 'w') as f: + f.write(_string) + return self + + def scores(self):# get affinity from pdbqt file + score_lst = [] + module_lst = self.getmodules() + for i,j in enumerate(module_lst): + # print(f'MODEL {i+1} affinity') + for ii in j: + if 'VINA RESULT' in ii: + # print(f'locate line {ii}') + score_lst.append(ii.split()[-3]) + return score_lst + + @property + def energies(self): + return self.scores() + + # 通过pdbqt文件计算非H原子格式 + def read_file(self): + """read_file read pdbqt file as a dict, key is "MODEL N", value is lines. + + [extended_summary] + + Arguments: + file {[type]} -- [description] + + Returns: + [type] -- [description] + """ + d,s,model = {},'',None + f = open(self.file, 'r') + while contentline := f.readline(): + if 'MODEL' in contentline: + model = contentline.strip() + else: + j = bool(contentline.strip() == 'ENDMDL') + if j and model: + d[model] = s + s = '' + if not j: s += contentline + f.close() + return d + + def SumNoHAtom(self, structure_id = 'MODEL 1'): # 计算非H原子个数 + d = self.read_file() + model = d[structure_id].split('\n') + # clean data + model = [i for i in model if i] + model = [i for i in model if i.split()[0] == 'HETATM'] + df = Bdp.transformtodataframe(readlinecontent = model, first_label = 'HETATM', keep_space=False) + atom_lst = [re.match(string = i, pattern='\D*').group(0) for i in df['AtomName'].to_list()] + n = 0 + for i in atom_lst: + if 'H' == i: + ... + else: + n += 1 + return n + diff --git a/vinautil/vinautil/pymolutils/__init__.py b/vinautil/vinautil/pymolutils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vinautil/vinautil/pymolutils/add_hydrogen.py b/vinautil/vinautil/pymolutils/add_hydrogen.py new file mode 100644 index 0000000..909d3fa --- /dev/null +++ b/vinautil/vinautil/pymolutils/add_hydrogen.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :add_hydrogen.py +@Description: : add hydrogen in pymol +@Date :2022/10/24 11:17:11 +@Author :hotwa +@version :1.0 +''' +from pymol import cmd +from openbabel import pybel +""" +conda install -c conda-forge pymol-open-source openbabel --yes +""" + +def add_polar_hydrogen(file, out_file,read_fmt = 'mol2', pymol_obj='sele', + save_format='mol2',tool = 'pymol'): + """add_polar_hydrogen _summary_ + + _extended_summary_ + + Arguments: + file {pathlike} -- input file path + out_file {pathlike} -- output file + + Keyword Arguments: + read_fmt {string} -- the format of read file (default: {'mol2'}) + pymol_obj {string } -- add polar hydrogen object name, ligand ID (default: {'sele'}) + save_format {string} -- the format of save file (default: {'mol2'}) + tool {string} -- the tool of add polar hydrogens (default: {'pymol'}) + + Returns: + _type_ -- path or None + """ + if tool == 'pymol': + cmd.reinitialize("everything") + cmd.load(filename=file, format=read_fmt) + cmd.h_add(pymol_obj) # add all hydrogens in this molecular + cmd.remove(f"{pymol_obj} & hydro & not nbr. (don.|acc.)") # remove no-polar-hydrogen + cmd.save(filename=out_file,selection=pymol_obj, format=save_format) + return out_file + elif tool == 'pybel': + omol = list(pybel.readfile(format = 'mol2', filename = file))[0] + omol.OBMol.AddPolarHydrogens() + omol.write('mol2',out_file,overwrite=True) + return out_file + else: + return None diff --git a/vinautil/vinautil/pymolutils/biotools.py b/vinautil/vinautil/pymolutils/biotools.py new file mode 100644 index 0000000..3fd96c7 --- /dev/null +++ b/vinautil/vinautil/pymolutils/biotools.py @@ -0,0 +1,465 @@ +from email import header +from unittest.mock import Base + + +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :biotools.py +@Description: : +@Date :2021/06/22 16:36:34 +@Author :hotwa +@version :1.0 +''' +import os,re,time +from Bio.PDB.PDBParser import PDBParser +from Bio.PDB import PDBIO,Select,parse_pdb_header,PDBList +from tools import mdcheck,get_path_contents +import requests +import random,string +from pathlib import Path + +file_dir = os.path.dirname(os.path.realpath(__file__)) # 当前python文件的绝对路径 +residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')] + +""" getpdbfile function""" +agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 -- '+''.join(random.sample(string.ascii_letters+string.digits, 32)) +HEADER = {'User-Agent': agent} + +def getpdbfile(pdbid,dir): # 下载备用,不是很稳定 + pdbid = pdbid.upper() + res = os.path.exists(f'{pdbid}.pdb') + if not res: + r = requests.get(url = f'https://files.rcsb.org/download/{pdbid}.pdb1',verify=False,headers=HEADER) + _file_path = os.path.join(dir,f'{pdbid}.pdb') + with open(_file_path,'wb+') as f: + f.write(r.content) + +""" getpdbfile function""" + + +class PDBLengthError(BaseException): + def __init__(self,arg): + self.arg = arg + +def downloadpdbfile(pdbid,pdir,overwrite=True,changename=True): + """downloadpdbfile [summary] + + [extended_summary] + + :param pdbid: like code 5xv7 + :type pdbid: string + :param pdir: save pdb file path + :type pdir: string + :param overwrite: overwrite pdb file, defaults to True + :type overwrite: bool, optional + :param changename: change .ent to .pdb, defaults to True + :type changename: bool, optional + """ + if isinstance(pdbid,list): + for i in pdbid: + downloadpdbfile_sub(pdbid=i,dir = pdir,overwrite=overwrite,changename=changename) + elif isinstance(pdbid,str): + if len(pdbid) == 4: + downloadpdbfile_sub(pdbid=pdbid,dir = pdir,overwrite=overwrite,changename=changename) + else: + raise PDBLengthError('pdbid length error!') + + +def downloadpdbfile_sub(pdbid,dir,overwrite,changename): + _pdbname = f'pdb{pdbid}.ent' + _pdbname_new = f'{pdbid}.pdb' + if not overwrite: + _path = os.path.join(dir,_pdbname_new) + if os.path.exists(_path): + ... + else: + pdbl = PDBList() + try: + pdbl.retrieve_pdb_file(pdbid,file_format='pdb',pdir = dir,overwrite=overwrite) + except FileNotFoundError: # 调用biopython下载接口失败,使用自己的接口 + getpdbfile(pdbid,dir) + if changename: + srcFile = os.path.join(dir,_pdbname) + dstFile = os.path.join(dir,_pdbname_new) + try: + os.rename(srcFile,dstFile) + except FileExistsError as e: # 如果重名名的文件存在,则表示该文件已经下载 + ... + except Exception as e: + raise e + elif overwrite: # 覆盖之前已经下载的pdb文件 + pdbl = PDBList() + try: + pdbl.retrieve_pdb_file(pdbid,file_format='pdb',pdir = dir,overwrite=overwrite) + except FileNotFoundError: # 调用biopython下载接口失败,使用自己的接口 + getpdbfile(pdbid,dir) + if changename: + srcFile = os.path.join(dir,_pdbname) + dstFile = os.path.join(dir,_pdbname_new) + try: + os.rename(srcFile,dstFile) + except Exception as e: + raise e + else: + print('error overwrite params!') + +class ChainSelect(Select): + def __init__(self,chain_string='A'): + super(Select,self).__init__() + self.chain_string = chain_string + + def accept_chain(self, chain): + if str(chain.__repr__()) == ''.format(self.chain_string): # judge chain name + return 1 + else: + return 0 + +class ResidueSelect(Select): + """ResidueSelect ues by select + + [extended_summary] + + :param Select: [description] + :type Select: [type] + """ + def __init__(self,residue): + super(Select,self).__init__() + self.residue = residue + + def accept_residue(self, residue): + if residue.get_resname()==self.residue: + return 1 + else: + return 0 + +def return_path(path): + # 判断是否为绝对路径 + path_split_string = os.path.split(path) + if os.path.isdir(path_split_string[0]) and os.path.isfile(path): + return path # 输入绝对路径,返回绝对路径 + elif bool(path_split_string[0]) == False: + return os.path.join(file_dir,path) # 尝试在当前目录下自动补全路径 + elif not (path_split_string[1][-4:] == '.pdb' or '.ent'): + raise ValueError(f'this file, {path} is not a valid pdb file') + else: + raise ValueError(f'valid path: {path}') + +# ! error class for Bdp +class DoiError(BaseException): + def __init__(self,arg): + self.arg = arg + +class ChainError(BaseException): + def __init__(self,arg): + self.arg = arg + + +class Bdp(object): + __slots__ = ['path','sid','mole_id','mole_struct','create_path'] + + def __init__(self,path,sid): + self.path = path + self.create_path = os.path.dirname(os.path.dirname(path)) + self.sid = sid + self.mole_id = [] + self.mole_struct = [] + # 初始化数据 + self.read_formula() + self.init_path() + + @property + def chain_num(self): + s = self.read_struct() + first_model = s[0] + return len(first_model) + + @property + def header_infos(self): + return parse_pdb_header(self.path) + + @property + def doi(self): + _journal = self.header_infos['journal'] + _doi = _journal.split()[-1] + if _doi.startswith('10.'): + return _doi + else: + raise DoiError(f'current pdb does not doi, {_journal}') + + def mkdir(self,pname,md = False): + """mkdir [summary] + + [extended_summary] + + :param pname: [description] + :type pname: [type] + :param md: [dict], defaults to False + :type md: [dict], optional + """ + def wrapper(func): + def deco(self,*arg,**kwargs): + create_path = os.path.join(self.create_path,pname) + if os.path.exists(create_path): + ... + else: + os.mkdir(create_path) + # create markdown + mdc = mdcheck(md) + header_info = f'''--- + title: {mdc['title']} + date: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} + type: {mdc['typeinfo']} + --- + [TOC] + # Description + {mdc['description']} + ''' + create_path_md = os.path.join(create_path,'README.md') + with open(create_path_md,'w+') as f: + f.write(header_info) + func(self,*arg,**kwargs) + return deco + return wrapper + + def value_check(self): + try: + if len(self.mole_id) == len(self.mole_struct): + print('check pass!') + else: + raise ValueError('molecule identifier do not match formula identifier! manual check') + except : + raise ValueError + + @staticmethod + def read_line_list(first_column,path): + """read_line_list 读取第一列为first_column字符串的行,存为列表返回 + + [extended_summary] + + Arguments: + first_column {[string]} -- [pdb文件内容第一列常常为大写] + path {[string]} -- [路径] + + Returns: + [iter] -- [含有first_column字符串的生成器] + """ + stringiter = Bdp.stringlinesiter(file = path) + stringiter = map(lambda x:x.strip(), stringiter) + return filter(lambda x:x.split()[0]==first_column,stringiter) + + @staticmethod + def stringlinesiter(file): + with open(file, 'r+', encoding='utf-8') as f: + yield from f.readlines() + + @mkdir(self = 'self',pname = 'split_molecule') + def split_molecule(self,residue): + """split_molecule [split molecule or residue line save to file] + + [extended_summary] + + :param residue: [select residue] + :type residue: [string] + :extract_chain: extract molecule corresponding chain + :type extract_chain: bool + """ + remove_list,residue = [],str(residue) + s = self.read_struct() + chainlist = self.getchain_str() + # 对每条链上都与这个小分子结合的链上的结合信息都提取出来 + for i in chainlist: + io = PDBIO() + io.set_structure(s[0][i]) + sidc = self.sid.replace('.pdb', '') if '.pdb' in self.sid else self.sid + savename = f'{sidc}_{i}_{residue}.pdb' + path = os.path.join(self.create_path,'split_molecule',savename) + io.save(path,ResidueSelect(residue)) + if os.path.exists(path): + with open(path,'r+') as f: + content = f.readline() + if content.strip() == 'END': + # 该链没有小分子,删除文件 + # print(f'{savename} this chain have not molecule remove it') + remove_list.append(path) + for i in remove_list: + os.remove(i) # remove the empty molecule file + + + def getchain_str(self): + """getchain_str retrun chain symbol name + + [extended_summary] + + :return: [description] + :rtype: list + """ + _l = [] + p = PDBParser(PERMISSIVE=1) + s = p.get_structure(self.sid, self.path) + chain_gennerate = s[0].copy().get_chains() + for i in chain_gennerate: + _l.append(i.id) + _l.sort() + return _l + + def site_exists(self,residue,num): + """site_exists 判断改蛋白的某个位点是否存在,并返回该位点所在的链 + + 用于判断结合位点的链在哪里 + + :param residue: 残基名称 + :type residue: string + :param num: 残基编号 + :type num: int + :return: chain + :rtype: list + """ + _l = [] + num = int(num) + residue = residue.strip().upper() + chainlist = self.getchain_str() + s = self.read_struct() + for i in chainlist: + try: + resname_from_pdb = s[0][i][num].get_resname() + if residue == resname_from_pdb: + _l.append(i) + except KeyError as e: + print(e,f'chain {i} not exist {residue}-{num}') + return _l + + def _split_chain_save_struct(self,obj,sid): + io = PDBIO() + io.set_structure(obj) + sidc = sid.replace('.pdb', '') if '.pdb' in sid else sid + name,path = f'{sidc}_{obj.id}.pdb', self.create_path + io.save(os.path.join(path,'split_chain',name),ChainSelect(obj.id)) + + @mkdir(self = 'self',pname ='split_chain') + def split_chain(self,extract='auto'): + """split_chain [summary] + + [extended_summary] + + :param extract: [extract the chain], defaults to 'auto' extract all chains + :type extract: [string], optional + :raises ValueError: [save to pdb in current path in ./splitchain] + """ + # 将传入的蛋白文件按照蛋白链分割并按照 pdbid_A_m.pdb 重新命名 A代表链,m表示该链含有小分子 + p = PDBParser(PERMISSIVE=1) + s = p.get_structure(self.sid, self.path) + if extract == 'auto': + chainlist = self.getchain_str() + for j in chainlist: + self._split_chain_save_struct(obj=s[0][j],sid=self.sid) + else: + chainlist = self.getchain_str() + if extract not in chainlist: + raise ChainError(f'The {extract} chain not exists !') + if extract in chainlist: + extract = extract.upper() + self._split_chain_save_struct(obj=s[0][extract],sid=self.sid) + else: + raise ValueError + + def read_struct(self): + """read_struct read pdb structure + + https://biopython-cn.readthedocs.io/zh_CN/latest/cn/chr11.html#id3 + 一个 Structure 对象的整体布局遵循称为SMCRA(Structure/Model/Chain/Residue/Atom,结构/模型/链/残基/原子)的体系架构: + + 结构由模型组成 + 模型由多条链组成 + 链由残基组成 + 多个原子构成残基 + Returns: + [obj] -- [Model] + """ + p = PDBParser(PERMISSIVE=1) + s = p.get_structure(self.sid, self.path) + return s + + def get_chain(self,chain_str): + s = self.read_struct() + return s[0][chain_str] + + def init_path(self): + _p = return_path(self.path) + self.path = _p + if os.path.isfile(_p) == False: + raise FileNotFoundError(f'file {_p} not found') + + def read_formula(self): + """read_formula read pdb file molecule information and identifier in this pdb file(just like residue) + + [extended_summary] + + :raises OSError: Do not find this file + """ + _path = self.path + try: + with open(_path,'r') as f: + text_line = f.readlines() + _l = [] + for i in text_line: + li = i.split() + if li[0] == 'FORMUL': + _l.append(li) + self.mole_id.append(li[2]) + for ii in _l: + self.mole_struct.append(''.join(ii[3:])) + except Exception as e: + raise e + + def clean_pdb(self,outfile=None,path=None): + __lst = Bdp.read_line_list('ATOM',self.path) + _p = Path(path) if path else Path(self.path).absolute().parent + _f = outfile if outfile else f"{self.sid}.clean.pdb" + write_path = _p.joinpath(_f) + if write_path.exists(): write_path.unlink() + with open(write_path,'w+') as f: + for i in __lst: + f.write(f"{i}\n") + +def renamepdbfile(path): + """renamepdbfile 自动对path路径中批量下载的pdb中文件的文件名进行重命名 + + [extended_summary] + + Arguments: + path {[type]} -- [description] + """ + _flist = get_path_contents(path)[0] + for i in _flist: + i = i.strip() + basetuple = os.path.split(i) + basedir = basetuple[0] + _fn = basetuple[1] + res = re.match(pattern='pdb([A-Za-z0-9]*).ent',string=_fn) + # nfn = res.group(1) + # newname =os.path.join(basedir,str(nfn)+'.pbd') + # os.rename(src = i,dst = newname) + try: + nfn = res.group(1) + newname =os.path.join(basedir,str(nfn)+'.pdb') + os.rename(src = i,dst = newname) + except IndexError: + raise IndexError(f'Could not find pdbid in this filename: {i}') + except AttributeError: + continue + except FileNotFoundError as e: + if os.path.exists(i): + raise e + elif os.path.exists(newname): + continue + else: + raise e + + + +def pdbupdate(path): + pl = PDBList(pdb=path) + pl.update_pdb() + + diff --git a/vinautil/vinautil/pymolutils/mutagenesis.py b/vinautil/vinautil/pymolutils/mutagenesis.py new file mode 100644 index 0000000..c125b97 --- /dev/null +++ b/vinautil/vinautil/pymolutils/mutagenesis.py @@ -0,0 +1,66 @@ +# 定义残基突变函数 +from pymol import cmd +from pathlib import Path +def Mutagenesis_site(filename: Path, mutation_type: str, site: int, outfile: Path = None) -> Path: + """Mutagenesis_site Mutagenesis residue in site + residue site have mutil conformations, need to select one conformations, some error accured. + pass + _extended_summary_ + + Arguments: + filename {str} -- PDB file format + mutation_type {str} -- 'VAL' for ALA TO VAL; 'ALA' for any/not ALA to ALA; 'GLY' for ALA to GLY + site {int} -- residue site in pdbfile + + Keyword Arguments: + outfile {str} -- _description_ (default: {None}) + + Raises: + ValueError: not one object in PDBs,need to fix + + Returns: + str -- save mutagenesis file path + """ + p = Path(filename) + savename = p.stem + f"_{site}_mutation.pdb" + _out_file = Path(outfile) if outfile else p.absolute().parent.joinpath(savename) + if not _out_file.absolute().parent.exists(): _out_file.absolute().parent.mkdir(parents=True) + cmd.reinitialize('everything') # ! clean up + cmd.load(filename.as_posix()) + PDBs = cmd.get_names() + # Get the ID numbers of c-alpha (CA) atoms of all residues + if len(PDBs) == 1: + PDB = PDBs[0] + else: + raise ValueError(f'this pdb have more than one object!PDBs:{PDBs}') + CAindex = cmd.identify(f"{PDB} and name CA") + pdbstrList = [cmd.get_pdbstr("%s and id %s" % (PDB, CAid)).splitlines() for CAid in CAindex] + ProtChainResiList = [[PDB, i[0][21], i[0][22:26].strip()] for i in pdbstrList] + for i, j, k in ProtChainResiList: + if int(k) == int(site): + cmd.wizard("mutagenesis") + # print(i,j,k) + cmd.refresh_wizard() + cmd.get_wizard().set_mode(mutation_type) + ##Possible mutation_type could be: + ##'VAL' for ALA TO VAL + ##'ALA' for any/not ALA to ALA + ##'GLY' for ALA to GLY + # 'selection' will select each residue that you have selected + # on ProtChainResiList above using the PDBid,chain,and residue + # present on your pdb file.If you didn't select a range on + # ProteinChainResiList, it will do the mutation on all the residues + # present in your protein. + selection = f"/{i}//{j}/{k}" + # Print selection to check the output + # print(selection) + # Selects where to place the mutation + cmd.get_wizard().do_select(selection) + ##Applies mutation + cmd.get_wizard().apply() + # Save each mutation and reinitialize the session before the next mutation + ##to have pdb files only with the residue-specific single-point mutation you were interested. + cmd.set_wizard("done") + cmd.save(_out_file.as_posix(), f"{PDB}") + cmd.reinitialize('everything') # Reinitialize PyMOL to default settings. + return _out_file \ No newline at end of file diff --git a/vinautil/vinautil/pymolutils/pymol_color_plugin.py b/vinautil/vinautil/pymolutils/pymol_color_plugin.py new file mode 100644 index 0000000..ffc7b1a --- /dev/null +++ b/vinautil/vinautil/pymolutils/pymol_color_plugin.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :pymol_color_plugin.py +@Description: :pymol配色方案 +@Date :2023/6/1 10:52:25 +@Author :hotwa +@version :1.0 +''' + + +from pymol import cmd +from pathlib import Path + +class colorp(): + color1 = (('color1', '[186,182,217]'), 'purple') + color2 = (('color2', '[233,195,153]'), 'yellow') + color3 = (('color3', '[43,113,216]'), 'blue-N') + color4 = (('color4', '[206,155,198]'), 'purple') + color5 = (('color5', '[251,187,62]'), 'orange') + color6 = (('color6', '[245,157,158]'), 'red') + color7 = (('color7', '[133,188,135]'), 'green') + color8 = (('color8', '[30,230,30]'),'green-CL') # Cl卤素配色 + color9 = (('color9', '[141,215,247]'),'blue-C') # C配色 + color10 = (('color10', '[0,132,55]'),'green-F') # F卤素配色 + grey1 = ('grey1','[224,224,224]') + colors = (color1, color2, color3, color4, color5, color6, color7, color8, color9, color10) + + def __init__(self, path, id): + self.id = id.lower() + cmd.reinitialize() + p = Path(path) + if p.is_dir(): + file = p.joinpath(f"{id}.pdb") + else: + raise ValueError('path params error') + cmd.load(file,id) + + def pretty(self): + cmd.remove('solvent metals') # 移除金属离子和溶剂 + cmd.remove("resn SO4 or resn PO4 or resn CL") + cmd.pretty(selection=self.id) + + @staticmethod + def defcolor(): + # 定义常用配色 + color_gennerate = map(lambda x:x[0],colorp.colors) + list(map(lambda x:cmd.set_color(*x),color_gennerate)) + # 定义灰度配色 + cmd.set_color(*colorp.grey1) + + @staticmethod + def grey(obj='sele',opt=True): + colorp.defcolor() + if opt: method.optimisation() # 优化 + # 对某个对象进行灰化 + cmd.color('grey1',f'{obj}') + + @staticmethod + def color_mole(obj='hetatm'): + # color blue,sele in name c* + colorp.defcolor() + cmd.color('color9',f'{obj} in name c*') + cmd.color('color6',f'{obj} in name o*') + cmd.color('color5',f'{obj} in name s*') + cmd.color('color3',f'{obj} in name n*') + cmd.color('color8',f'{obj} in name cl*') + cmd.color('color10',f'{obj} in name f*') + + @staticmethod + def pretty(obj = None): + if not obj: obj = cmd.get_names()[0] + cmd.remove('solvent metals') # 移除金属离子和溶剂 + cmd.pretty(selection=obj) + + +class font(): + font_size = 28 # 单位就是正常的px。你也可以用负值,则单位是Å + +class mole(): + stick_radius=0.10 + +class atom(): + size = 0.28 + +class method(): + + @staticmethod + def optimisation(grey=True, selection='sele'): + colorp.defcolor() + if grey: cmd.set('cartoon_color','grey1', selection=selection) + cmd.set('stick_transparency','0.1', selection=selection) + cmd.set("ray_shadows","off", selection=selection) + cmd.set('cartoon_highlight_color', selection=selection) + cmd.set('cartoon_fancy_helices', selection=selection) + cmd.set('cartoon_transparency','0.5', selection=selection) + + @staticmethod + def remove(): + cmd.remove('resn SO4') # SO4 remove + cmd.remove('solvent metals') # 移除金属离子和溶剂 + +def get_resn(name:str = 'all'): + resn_list = [] + cmd.iterate(selection=name, expression=lambda atom: resn_list.append(atom.resn)) + return tuple(set(resn_list)) + + +def protgrey(): + method.remove() + method.optimisation() + +# 观察小分子 +# protgrey() +# colorp.color_mole('sele') +# add label +# label sele, "your label" +# black background +# set bg_rgb, [0, 0, 0] +# define label +# cmd.get_position("sele") +# set label_position, [x,y,z] +# cmd.h_add(pymol_obj) # add all hydrogens in this molecular +# cmd.remove(f"{pymol_obj} & hydro & not nbr. (don.|acc.)") # remove no-polar-hydrogen +# cmd.remove(f"sele & hydro") # 移除所有的H +# cmd.set('cartoon_color','color8', 'obj') +# cmd.hide("everything", "chain A and resn UHT and alt B") # 小分子隐藏B构象 +# cmd.label("chain A and resn UHT and alt A", "resn") # 添加标签 +#cmd.select('sele', f'resn {resn} and chain {chain}') +#cmd.center('sele') +#cmd.orient('sele') +#obj_name = 'around_and_self' +# cmd.create(obj_name, f'(resn {resn} and chain {chain}) around {distance}') # 不扩展残基 +# cmd.create(obj_name, f'byres (resn {resn} and chain {chain}) around {distance}') # 扩展残基 +# cmd.create(obj_name, f'(resn {resn} and chain {chain}) around {distance} or (resn {resn} and chain {chain})') # 选择resn的对象和resn对象周围3A的原子 +#cmd.create(obj_name, f'byres (resn {resn} and chain {chain}) around {distance} or (resn {resn} and chain {chain})') # 选择resn的对象和resn对象周围3A的原子扩展至残基的原子 +#cmd.hide('everything', 'all') +#cmd.show('lines', obj_name) +cmd.extend('colorp',colorp) +cmd.extend('method',method) \ No newline at end of file diff --git a/vinautil/vinautil/pymolutils/pymol_plugin.py b/vinautil/vinautil/pymolutils/pymol_plugin.py new file mode 100644 index 0000000..d8da346 --- /dev/null +++ b/vinautil/vinautil/pymolutils/pymol_plugin.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :auto.py +@Description: :自动处理脚本 +@Date :2021/09/02 15:19:15 +@Author :hotwa +@version :1.0 +https://zhuanlan.zhihu.com/p/121215784 # PyMOL 选择器的语法参考 +https://blog.csdn.net/u011450367/article/details/51815130 +resource: https://www.cnblogs.com/wq242424/p/13073222.html +http://blog.sina.com.cn/s/blog_7188922f0100wbz1.html +https://www.jianshu.com/p/3396e94315cb +https://blog.csdn.net/dengximo9047/article/details/101221495 # 疏水表面 +http://www.mdbbs.org/thread-4064-1-1.html # 用来寻找配体口袋的残基 +''' + +from functools import partial +from pymol import cmd +from pathlib import Path +import pandas as pd +from json import dumps +import numpy as np +from biotools import Bdp +from loguru import logger +# from types import FunctionType,CodeType + +logger.add('pymol_plugin_{time}.log') + +# select ligand, resn x +# cmd.select('ligand','byres 5UEH within 6 of resn GOL resn 85P') +def autoshow(i,path,distance = 6,ionshow = False): + """autoshow 自动展示所有配体6A以内的lines,方便查找共价键 + + [extended_summary] # cmd.create('pocket',f'byres {i} within {distance} of {rawstring}') # 创建一个在当前pdb对象中配体残基周围距离为6A的口袋对象 + + Arguments: + i {[string]} -- [pdbid 例如 5EA9] + + Keyword Arguments: + path {[string]} -- [pdb文件存放的目录,目前支持后缀为.pdb的文件,也可以在全局变量中设置好文件目录] (default: {path}) + distance {[int]} -- [显示周围原子的距离参数] (default: {6}) + ionshow {[bool]} -- [离子和有机小分子周围共价结合观察使用,默认不展示] (default: {False}) + + Returns: + [list] -- [返回ligid标识符,除去了部分离子] + """ + cmd.reinitialize() + p = Path(path) + file = p.joinpath(f"{i}.pdb") + cmd.load(file,i) + cmd.remove('solvent metals') # 移除金属离子和溶剂 + mole = moleculeidentity(f"{i}",path) + rawstring = 'resn ' + ' resn '.join(mole.ligIdNoion) + print(f'{rawstring} around {distance}') + cmd.select('ligand',f'{rawstring}') + cmd.select('ligand_around',f'{rawstring} around {distance}') # 选择一个在当前pdb对象中配体残基周围距离为6A的口袋对象 + if ionshow: # 是否显示所有记录小分子HET条记录中的信息,对于离子与有机物显示相关共价键有效 + cmd.show('lines','ligand_part') # 显示所有HET侧链 + cmd.create('ligand_part',f'{rawstring} expand {distance}') # 单独显示小分子扩展6A周围的lines对象 + cmd.create('organ',f'organic expand {distance}') + cmd.show('lines','organ') + return mole.ligId + +def bejson(d:dict(help='need to beauty json')) -> dict: + return dumps(d,indent=4,ensure_ascii=False) + +class covalentidentity(): + """covalentidentity 使用pymol进行识别共价键,在输出有机小分子6A以内的原子时可能出现分子的构象发生改变导致共价键距离计算有问题,还可能是与金属离子形成共价键 + """ + # __slots__ = [] + + def __init__(self,pdbfilename,pdbid,path): + """__init__ [summary] + + [extended_summary] + + Arguments: + pdbfilename {[string]} -- pdb文件名称 + pdbid {[string]} -- 4 length id + path {[string or pathlib obj]} -- pdb文件路径 + """ + self.pdbfilename = pdbfilename + self.pdbid = pdbid + self.path = Path(path) + self.init() + + @staticmethod + def cleanpdb(before_string): + """ + 处理传入参数pdb 例如: pdb1b12.ent 1b12.pdb 转化成 1b12 + """ + # print(f'try to format {before_string}') + if '.ent' in before_string: + before_string = before_string[3:-4].lower() + elif '.pdb' in before_string: + before_string = before_string[:4].lower() + elif len(before_string) == 4: + before_string = before_string.lower() + else: + if len(before_string) != 4: + raise ValueError(f'length out of 4 {before_string}') + return before_string + + @classmethod + def export_organ(cls,pdbfile,path,distance = 6,remove_water_metals = True): + # 输出小分子结合部分的共价结合信息对象,pdb格式 + file = path.joinpath(pdbfile) + _pdbid = covalentidentity.cleanpdb(pdbfile) + savepath = path.joinpath(f'{_pdbid}_organ_around_{distance}_lines.pdb') + cmd.reinitialize() + cmd.load(filename = file,object = pdbfile) + if remove_water_metals: cmd.remove('solvent metals') # 移除金属离子和溶剂 + cmd.create('organ',f'organic expand {distance}') + cmd.show('lines','organ') + cmd.save(filename=savepath,selection='organ') + cls.connect_info_pdb_path = savepath + return savepath + + def init(self): + self.__create_dataframe() + + def __read_organ_pdb(self): + exportfile = covalentidentity.export_organ(self.pdbfilename,path = Path(self.path)) + with open(file=exportfile,mode = 'r') as f: + read_list = [i.replace('\n', '') for i in f.readlines()] + self.read_connect = [i for i in read_list if i[:6].strip()=='CONECT'] + self.read_hetatm = [i for i in read_list if i[:6].strip()=='HETATM'] + self.read_atom = [i for i in read_list if i[:6].strip()=='ATOM'] + + def __create_dataframe(self): + self.__read_organ_pdb() # 读取初始化数据 + connect_infos = [] + for i in self.read_connect: #清洗数据 + if len(i.split()) == 3: # 策略:查找只有两个原子键的连接信息 + connect_infos.append(i) + self.df = self.transformtodataframe(self.read_hetatm).append(self.transformtodataframe(self.read_atom)) + target_search_connect = [] + for i in connect_infos: + l = [i for i in i.split() if i != 'CONECT'] + target_search_connect.append(l) + self.target_search_connect = target_search_connect # 两个连接信息的列表,不一定都是共价键 + + def __search_convenlent(self,distance_accuracy=1): + true_covlent = [] # 共价键连接信息记录 有几个列表就有几个共价键链接信息 + infos = [] + locateinfos = {} + for item in self.target_search_connect: # 对两个连接原子进行判断 + df = self.df + # molecule chain search + res0 = df[df['AtomNum']==item[0]] + res1 = df[df['AtomNum']==item[1]] + if (res0['Identifier'].all() == res1['Identifier'].all()): + continue + else: + true_covlent.append(item) + # 定位信息那个是小分子的行信息和那个是蛋白行的信息 + if res0['Identifier'].all() == 'ATOM': locateinfos['ATOM'] = res0 + if res0['Identifier'].all() == 'HETATM': locateinfos['HETATM'] = res0 + if res1['Identifier'].all() == 'ATOM': locateinfos['ATOM'] = res1 + if res1['Identifier'].all() == 'HETATM': locateinfos['HETATM'] = res1 + assert (not res0.empty and not res1.empty),( + 'this code occur a bug' + 'try to connet to author fix it' + 'maybe is pymol output connect infos error' + 'never be happen' + ) + partinfos = self.__fill_infos(i=item,pro=locateinfos['ATOM'],mole=locateinfos['HETATM'],distance_accuracy=distance_accuracy) + infos.append(partinfos) + return infos + + def convenlent_infos(self,distance_accuracy=1): + return self.__search_convenlent(distance_accuracy=distance_accuracy) + + @property + def cov_infos(self): + d = self.__search_convenlent() + return d + + @staticmethod + def __fill_infos(i,pro,mole,distance_accuracy=1): + distance = np.sqrt(np.square(float(pro['X'].all())-float(mole['X'].all())) + np.square(float(pro['Y'].all())-float(mole['Y'].all())) + np.square(float(pro['Z'].all())-float(mole['Z'].all()))) + infostmplate = { + 'ConnectInfos': i, + 'ResidueName':pro['ResidueName'].all(), + 'ResidueSequence':pro['ResidueSequence'].all(), + 'ChainIndentifier':pro['ChainIndentifier'].all(), + 'CovenlentAtom':{ + 'protein': { + 'AtomName':pro['AtomName'].all(), + 'ElementSymbol':pro['ElementSymbol'].all(), + }, + 'molecule':{ + 'AtomName':mole['AtomName'].all(), + 'ElementSymbol':mole['ElementSymbol'].all(), + }, + }, + 'CovenlentDistance':round(distance,distance_accuracy), + 'LigandName':mole['ResidueName'].all(), + } + return infostmplate + + @staticmethod + def transformtodataframe(readlinecontent=None,first_label = 'ATOM',path = False): + path_info = (Path(path).name.__str__(),Path(path).parent.__str__()) if path else ('unknown file name','unknown path') + if path: readlinecontent = Bdp.read_line_list(first_column=first_label,path=path) + if readlinecontent == None: raise ValueError('readlinecontent need') + if first_label == ('ATOM' or 'HETATM'): # ! 注意python中的懒惰运算,及运算符特性 + # 将pdb每行读取的信息转化为dataframe 针对ATOM HETAM开头的行适用 + Identifier,AtomNum,AtomName,AtomLocationIndicator,ResidueName,ChainIndentifier,ResidueSequence,InsertionsResidue,X,Y,Z,Occupancy,Tfactor,SegmentIdentifier,ElementSymbol = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[] + for i in readlinecontent: + Identifier.append(i[:7].strip()) + AtomNum.append(i[6:11].strip()) + AtomName.append(i[12:16].strip()) + AtomLocationIndicator.append(i[16].strip()) + ResidueName.append(i[17:20].strip()) + ChainIndentifier.append(i[21].strip()) + ResidueSequence.append(i[22:26].strip()) + InsertionsResidue.append(i[26].strip()) + X.append(i[30:38].strip()) + Y.append(i[38:46].strip()) + Z.append(i[46:54].strip()) + Occupancy.append(i[54:60].strip()) + Tfactor.append(i[60:66].strip()) + SegmentIdentifier.append(i[72:76].strip()) + ElementSymbol.append(i[76:78].strip()) + df = pd.DataFrame(data={ + 'Identifier':Identifier, + 'AtomNum':AtomNum, + 'AtomName':AtomName, + 'AtomLocationIndicator':AtomLocationIndicator, + 'ResidueName':ResidueName, + 'ChainIndentifier':ChainIndentifier, + 'ResidueSequence':ResidueSequence, + 'InsertionsResidue':InsertionsResidue, + 'X':X, + 'Y':Y, + 'Z':Z, + 'Occupancy':Occupancy, + 'Tfactor':Tfactor, + 'SegmentIdentifier':SegmentIdentifier, + 'ElementSymbol':ElementSymbol + }) + elif first_label == 'LINK': + # https://www.wwpdb.org/documentation/file-format-content/format33/sect6.html + Record_name,Atom_name1,Alternate_location_indicator1,Residue_name1,Chain_identifier1,Residue_sequence_number1,Insertion_code1,Atom_name2,Alternate_location_indicator2,Residue_name2,Chain_identifier2,Residue_sequence_number2,Insertion_code2,Symmetry_operator_atom_1,Symmetry_operator_atom_2,Link_distance = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[] + for i in readlinecontent: + if len(i) != 78: + logger.exception( + f"[LINK label length error] not match the LINK line length\n" + f"[input string]:{i}\n" + f"check your input file:{path_info[0]} from {path_info[1]}\n" + "if this line not have link distance, try to caculate by covalent.bypymol method") + pass + else: + Record_name.append(i[:6].strip()) + Atom_name1.append(i[12:16].strip()) + Alternate_location_indicator1.append(i[16].strip()) + Residue_name1.append(i[17:20].strip()) + Chain_identifier1.append(i[21].strip()) + Residue_sequence_number1.append(i[22:26].strip()) + Insertion_code1.append(i[26].strip()) + Atom_name2.append(i[42:46].strip()) + Alternate_location_indicator2.append(i[46].strip()) + Residue_name2.append(i[47:50].strip()) + Chain_identifier2.append(i[51].strip()) + Residue_sequence_number2.append(i[52:56].strip()) + Insertion_code2.append(i[56].strip()) + Symmetry_operator_atom_1.append(i[59:65].strip()) + Symmetry_operator_atom_2.append(i[66:72].strip()) + Link_distance.append(i[73:78].strip()) + df = pd.DataFrame(data={ + 'Record_name':Record_name, + 'Atom_name1':Atom_name1, + 'Alternate_location_indicator1':Alternate_location_indicator1, + 'Residue_name1':pd.Series(Residue_name1,dtype='object'), + 'Chain_identifier1':Chain_identifier1, + 'Residue_sequence_number1':Residue_sequence_number1, + 'Insertion_code1':Insertion_code1, + 'Atom_name2':Atom_name2, + 'Alternate_location_indicator2':Alternate_location_indicator2, + 'Residue_name2':pd.Series(Residue_name2,dtype='object'), + 'Chain_identifier2':Chain_identifier2, + 'Residue_sequence_number2':Residue_sequence_number2, + 'Insertion_code2':Insertion_code2, + 'Symmetry_operator_atom_1':Symmetry_operator_atom_1, + 'Symmetry_operator_atom_2':Symmetry_operator_atom_2, + 'Link_distance':pd.Series(Link_distance,dtype='float32'), + }) + else: + assert False,( + 'No return', + 'a error occurred' + ) + return df + +class color_plan(): + color1 = (('color1', '[186,182,217]'), 'purple') + color2 = (('color2', '[233,195,153]'), 'yellow') + color3 = (('color3', '[141,215,247]'), 'blue') + color4 = (('color4', '[206,155,198]'), 'purple') + color5 = (('color5', '[251,187,62]'), 'orange') + color6 = (('color6', '[245,157,158]'), 'red') + color7 = (('color7', '[133,188,135]'), 'green') + colors = (color1, color2, color3, color4, color5, color6, color7) + + @staticmethod + def defcolor(): + color_gennerate = map(lambda x:x[0],color_plan.colors) + list(map(lambda x:cmd.set_color(*x),color_gennerate)) + +# error class +class PathError(BaseException): + def __init__(self,arg): + self.arg = arg + +class moleculeidentity(): + """moleculeidentity [summary] + + [识别pdb蛋白文件中的小分子标识符] + """ + + def __init__(self,pdbfile,path): + self.pathstr = path + self.path = Path(path) + self.pdbfile = pdbfile + self._init() + + def _init(self): + if not self.path.exists(): + raise PathError('path not exist') + if not isinstance(self.pdbfile,str): + raise TypeError('Pdbid must be a string!') + if ('.pdb' not in self.pdbfile) and (len(self.pdbfile) != 4): + raise TypeError('Pdbid must be 4 letters') + if ('.pdb' or '.PDB') in self.pdbfile: + raise TypeError(f'{self.pdbfile} Remove ".pdb" from input arg, add automatically') + file_list = list(self.path.glob('*.pdb')) + self.path_parent = file_list[0].parent + self.pdbfilelist = [i.name[:4].upper() for i in file_list] + + def __parse_pdb_ligid(self,ion=True): + if self.pdbfile.upper() not in self.pdbfilelist: + raise FileNotFoundError(f'not found {self.pdbfile} in {self.path}') + infos_line = [] + ligId = [] + for i in self.__generate_pdb_lines(): + if self.check_line_header(i) == 'HET': + infos_line.append(i) + ligId = [i.split()[1] for i in infos_line] + ligId = list(set(ligId)) + if not ion: ligId = [i for i in ligId if len(i) == 3 ] # remove ion from list + return ligId + + @property + def ligId(self): + # return ligId include ion + return self.__parse_pdb_ligid() + + @property + def ligIdNoion(self): + return self.__parse_pdb_ligid(ion=False) + + @staticmethod + def check_line_header(line_text): + return line_text[0:6].strip() + + def __generate_pdb_lines(self): + openpdbfile = self.pdbfile + '.pdb' if '.pdb' not in self.pdbfile else self.pdbfile + for row in open(self.path_parent.joinpath(openpdbfile),'r+'): + yield row.strip() + +class covalent(object): + + def __init__(self,pdbfilename,path,pdbid): + self.pdbfilename = pdbfilename + self.pdbid = covalentidentity.cleanpdb(pdbid) + self.path_parent = Path(path) + self.path = self.path_parent.joinpath(pdbfilename) + self._init() + + def _init(self): + self.link_df = covalentidentity.transformtodataframe(path = self.path,first_label='LINK') + + @classmethod + def bypdb(cls,pdbfilename,path,pdbid): + return cls(pdbfilename=pdbfilename,path=path,pdbid=pdbid) + + @staticmethod + def bypymol(pdbfilename,pdbid,path): + return covalentidentity(pdbfilename,pdbid,path) + + @property + def mole_id(self)->list: + _instance=Bdp(path=self.path,sid=self.pdbid) + return [i for i in _instance.mole_id if len(i)==3 and i != 'HOH' and i != 'SO4' and i != 'PO4'] # ! 在这里手动剔除掉硫酸根和磷酸根 + + def modresdata(self)->list: + """modresdata 标准残疾的修饰信息 + """ + modres_lst = Bdp.read_line_list(first_column='MODRES',path=self.path) + lst = [i[12:15] for i in modres_lst] # 切片出修饰残基的小分子 + return list(set(lst)) + + def get_covalent(self): + # 从原始的pdb文件中可获取共价键信息 # ? 对其中的信息正确率为100% 来自原始的pdb文件中的信息 + res1 = self.link_df.query(f'Residue_name1 in {self.remove_modres_moleid}') + res2 = self.link_df.query(f'Residue_name2 in {self.remove_modres_moleid}') + df = pd.merge(res1, res2,how='outer') + df.astype('object') + # remove ions link with ligand + df = df[df['Residue_name1'].str.len()==3] + df = df[df['Residue_name2'].str.len()==3] + return df + + + @property + def remove_modres_moleid(self)->list: + # 清除link信息中的modres,即修饰残基的ligid + # lst = Bdp.read_line_list(first_column='LINK',path=self.path) + # lst = list(set(map(lambda s:s[17:20],lst))) + clean_func = partial(self.func_dynamic_data_template,dynamic_data=self.ModifiedResidues) + res = clean_func(origin_data=self.mole_id) + return list(filter(lambda x:len(x.strip())==3 and x != 'HOH',res)) + + @property + def ModifiedResidues(self): + ModifiedResiduesId = [] + for i in self.mole_id: + if i in self.modresdata(): + ModifiedResiduesId.append(i) + # 从SEQRES序列中获取MODRES的修饰残基的小分子,仅仅从SEQRES有时获取不全面 + seqres_lst = [i[19:].strip() for i in Bdp.read_line_list(first_column='SEQRES',path = self.path)] + seqres_lst_string = '\r\n'.join(seqres_lst) + seqres_lst_set = set(list(i for line in seqres_lst_string.splitlines() for i in line.split())) + seqres_list = seqres_lst_set.intersection(set(self.mole_id)) + ModifiedResiduesId.extend(seqres_list) + return list(set(ModifiedResiduesId)) + + @staticmethod + def func_dynamic_data_template(origin_data:list,dynamic_data:list)->list: + # 取origin_data和dynamic_data补集 + origin_data_set = set(origin_data) + dynamic_data_set = set(dynamic_data) + _data = origin_data_set.difference(dynamic_data_set) + return _data + + + + +cmd.extend('autoshow', autoshow) # pymol中自定义autoshow函数,请使用cmd.autoshow()命令 + +#! 下面为具体使用查找共价键的代码 +# def read_link_lines(item): +# ins1 = covalent.bypdb(pdbfilename = item,path='M:\program\\autotask\search_res1_pdb',pdbid = item[3:7]) +# res = ins1.get_covalent() +# res['Pdb_id'] = [item[3:7] for _ in range(len(res))] +# return res + +def return_mole(series): + # LINK 字段信息进行分析解读 + residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')] + if str(series[1]['Residue_name1']) in residue: + if str(series[1]['Residue_name2']) in residue: + # 除去两个氨基酸连接的情况 + return pd.DataFrame() + else: + _data = series[1]['Residue_name2'],series[1]['Link_distance'],series[1]['Pdb_id'],series[1]['Chain_identifier2'] + _df = pd.DataFrame(data={ + 'LigandName': pd.Series([_data[0]],dtype='string'), + 'Link_distance': pd.Series([_data[1]],dtype='float32'), + 'Pdb_id': pd.Series([_data[2]],dtype='string'), + 'Chain_identifier': pd.Series([_data[3]],dtype='string') + }) + return _df + elif str(series[1]['Residue_name2']) in residue: + if str(series[1]['Residue_name1']) in residue: + # 除去两个氨基酸连接的情况 + return pd.DataFrame() + else: + _data = series[1]['Residue_name1'],series[1]['Link_distance'],series[1]['Pdb_id'],series[1]['Chain_identifier1'] + _df = pd.DataFrame(data={ + 'LigandName': pd.Series([_data[0]],dtype='string'), + 'Link_distance': pd.Series([_data[1]],dtype='float32'), + 'Pdb_id': pd.Series([_data[2]],dtype='string'), + 'Chain_identifier': pd.Series([_data[3]],dtype='string') + }) + return _df + else: + logger.exception( + f'[MoleculeMatchError] Pdbid {series[1]["Pdb_id"]} occur a error\n' + f'出现了两个残基名称都不属于常见的20中\n{residue}\n' + f'source: {series[1]}' + ) + return pd.DataFrame() # 两个小分子连接的情况,去除 + +if __name__ == '__main__': + ... \ No newline at end of file diff --git a/vinautil/vinautil/restore_mol2.py b/vinautil/vinautil/restore_mol2.py new file mode 100644 index 0000000..1e85bb4 --- /dev/null +++ b/vinautil/vinautil/restore_mol2.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file : restore_mol2.py +@Description: : restore docked pdbqt to mol2 +@Date :2022/11/17 10:26:36 +@Author :hotwa +@version :1.0 +''' +import os +from pathlib import Path, PurePath +from openbabel import pybel + +def get_coord_dict(fmt, file): + molH = [i for i in pybel.readfile(format = fmt, filename=file.__str__())][0] + molH.OBMol.DeleteHydrogens() + return {atom.idx: atom.coords for atom in molH} + +def pdbqt2mol2(original_mol2_file, undock_pdbqt, docked_pdbqt, out_mol2): + # mind refence https://github.com/ag83/pdbqt-to-mol2/blob/be40bdda20ffb96cd3d173accf77e7a2da9a49aa/convert_to_mol2.py#L15 + # convert autodock vina dock results restore to mol2 format, which include bond infomations + undocked_pdbqt = get_coord_dict('pdbqt', undock_pdbqt) + docked_pdbqt = get_coord_dict('pdbqt', docked_pdbqt) + original_mol2 = get_coord_dict('mol2', original_mol2_file) + assert len(undocked_pdbqt) == len(docked_pdbqt) == len(original_mol2),f'Not equal number of atoms in molecules\n{docked_pdbqt}' + original_coord = {} + for key in original_mol2: + coord_update = [round(x, 3) for x in original_mol2[key]] + coord_update = tuple(coord_update) + original_coord.update({key: coord_update}) + coord_map = {} + for idx, coord in original_coord.items(): + # potential bottleneck for large molecules + for ind, coordinates in undocked_pdbqt.items(): + n=0 + if coord[0] == coordinates[0]: + n=n+1 + if coord[1] == coordinates[1]: + n=n+1 + if coord[2] == coordinates[2]: + n=n+1 + else: + if coord[2] == coordinates[2]: + n=n+1 + else: + if coord[1] == coordinates[1]: + n=n+1 + if coord[2] == coordinates[2]: + n=n+1 + else: + if coord[2] == coordinates[2]: + n=n+1 + if n == 3: + coord_map.update({idx:ind}) + elif n == 2: + if idx in coord_map: + pass + else: + coord_map.update({idx:ind}) + elif n == 1: + if idx in coord_map: + pass + else: + coord_map.update({idx:ind}) + else: + pass + if len(coord_map) == len(original_mol2): + coord_conform = {} + for index1, index2 in coord_map.items(): + coord_conform.update({index1:docked_pdbqt.get(index2)}) + mol2 = pybel.readfile('mol2', original_mol2_file) + mol2 = next(mol2) + mol2.OBMol.DeleteHydrogens() + for atom in mol2: + atom.OBAtom.SetVector(coord_conform.get(atom.idx)[0], coord_conform.get(atom.idx)[1], coord_conform.get(atom.idx)[2]) + mol2.write('mol2', out_mol2, overwrite=True) + else: + print('Lost coordinates in mapping') + +if __name__ == '__main__': + # restore docked_pdbqt to mol2 + docked_pdbqt = f'test_file/test_mgltools_vina112_dock_2xd1/vina112_results/2xd1_A-2XD1_A_CEF/2xd1_A-2XD1_A_CEF_0.pdbqt' + undocked_pdbqt = f'test_file/test_mgltools_vina112_dock_2xd1/pymol_addHs_mgltools_CEF.pdbqt' + original_mol2 = f'test_file/pymol_addHs_CEF.mol2' + out_mol2 = f'test_file/restore_CEF.mol2' + pdbqt2mol2(original_mol2, undocked_pdbqt, docked_pdbqt, out_mol2) \ No newline at end of file diff --git a/vinautil/vinautil/scardock.py b/vinautil/vinautil/scardock.py new file mode 100644 index 0000000..3fc357d --- /dev/null +++ b/vinautil/vinautil/scardock.py @@ -0,0 +1,174 @@ +import argparse +from pathlib import Path +import os,sys +import subprocess +import datetime +from loguru import logger + +from pymol import cmd +from openbabel import pybel +from typing import List +from vinautil.vutils.obabel import PDBQTtoMol2, PDBQTparser +from vinautil.vutils.spyrmsd_load import symmrmsd_mol2_list +from vinautil.vina import Vina +from vinautil.pymolutils.mutagenesis import Mutagenesis_site + +here = Path(__file__).parent.resolve() +conda_prefix = Path(os.environ.get('CONDA_PREFIX')) +python2_interpreter = conda_prefix.joinpath('bin/python2') +python3_interpreter = conda_prefix.joinpath('bin/python3') +prepare_ligand4 = conda_prefix.joinpath('MGLToolsPckgs/AutoDockTools/Utilities24/prepare_ligand4.py') +prepare_receptor4 = conda_prefix.joinpath('MGLToolsPckgs/AutoDockTools/Utilities24/prepare_receptor4.py') +mk_prepare_ligand = conda_prefix.joinpath('bin/mk_prepare_ligand.py') + +def dockvina(receptor:Path, ligand:Path, center:List[float], box_size:List[float], + exhaustiveness:int =32,n_poses:int =20,out_n_poses:int = 20): + out_stem = f'{receptor.stem}--{ligand.stem}' + out_dir = receptor.parent + v = Vina(sf_name='vina') + v.set_receptor(receptor.as_posix()) + v.set_ligand_from_file(ligand.as_posix()) + v.compute_vina_maps(center=center, box_size=box_size) + # Score the current pose + energy = v.score() + print('Score before minimization: %.3f (kcal/mol)' % energy[0]) + # Minimized locally the current pose + energy_minimized = v.optimize() + print('Score after minimization : %.3f (kcal/mol)' % energy_minimized[0]) + v.write_pose(out_dir.joinpath(f'{out_stem}_minimized.pdbqt').as_posix(), overwrite=True) + # Dock the ligand + v.dock(exhaustiveness=exhaustiveness, n_poses=n_poses) + docked_file = out_dir.joinpath(f'{out_stem}.pdbqt') + v.write_poses(docked_file.as_posix(), n_poses=out_n_poses, overwrite=True) + return docked_file + +def SCARdockbase(receptor: Path, ligand: Path, chain: str, site: str): + # clean pdb file + receptor = cleanATOM(receptor.as_posix()) # same pyrosetta.toolbox cleanATOM + if not receptor.exists(): + raise FileNotFoundError(f'{receptor.as_posix()} not found') + # protein Mutagenesis (GLY) + print('Mutagenesis site: ', site) + muta_receptor = receptor.parent.joinpath(f'{receptor.stem}_{site}G.pdb') + Mutagenesis_site(filename=receptor, mutation_type='GLY', site=int(site), outfile= muta_receptor) + # prepare receptor dock file + print('Prepare PDBQT receptor file: ', muta_receptor.name) + receptor_pdbqt = receptor.parent.joinpath(f"{muta_receptor.stem}.pdbqt") + CMD_ = f'{python2_interpreter} {prepare_receptor4} -r {receptor.as_posix()} -o {receptor_pdbqt.as_posix()} -A checkhydrogens' + p = subprocess.Popen(CMD_, shell=True, stdout=subprocess.PIPE) + while p.poll() is None: # progress still runing + subprocess_read_res = p.stdout.read().decode('utf-8') + logger.info(f'''Task record : {datetime.datetime.now()}:\n {subprocess_read_res}''') + # prepare ligand dock file(mol2 file) + ligand = Path(ligand) + ligand_pdbqt = ligand.parent.joinpath(f"{ligand.stem}.pdbqt") + if not ligand.exists(): + raise FileNotFoundError(f'{ligand} not found') + # use openbabel add polar hydrogens + print('Add polar hydrogens(Openbabel): ', ligand.name) + molH = pybel.readfile('mol2', ligand.as_posix()) + molH = next(molH) + molH.OBMol.DeleteHydrogens() + molH.OBMol.AddPolarHydrogens() + molH.write('mol2', ligand.as_posix(), overwrite=True) + # use meeko backend prepare ligand, optional MGLtools prepare_ligand4.py + print('Prepare PDBQT ligand file: ', ligand.name) + CMD_ = f'{python3_interpreter} {mk_prepare_ligand} -i {ligand.as_posix()} -o {ligand_pdbqt.as_posix()}' + p = subprocess.Popen(CMD_, shell=True, stdout=subprocess.PIPE) + while p.poll() is None: + subprocess_read_res = p.stdout.read().decode('utf-8') + logger.info(f'''Task record : {datetime.datetime.now()}:\n {subprocess_read_res}''') + # box center, covalent beta carbon coordinate + cmd.reinitialize('everything') + cmd.load(receptor.as_posix()) + coordinates = [] + cmd.select('res_covalent_atoms', f'chain {chain} and resi {site} and name CB') # select residue covalent site beta carbon, be careful GLY have no side chain with beta carbon + cmd.iterate_state('0', 'res_covalent_atoms', 'coordinates.append([x,y,z])', space=locals()) + if len(coordinates) == 1: + center = coordinates[0] + print('Covalent residue beta carbon coordinate(center): ', center) + else: + raise ValueError(f'covalent beta carbon site {site} not found!') + # SCARdock docking + # ! be careful, orginal molecule coordinate should in docking box + print("The molecular coordinates of the input mol2 file should be within a range of 40 angstroms, with the covalent residue's beta carbon atom as the center of the extended box.") + docked_file = dockvina(receptor=receptor_pdbqt, ligand=ligand_pdbqt, center=center, box_size=[40, 40, 40], exhaustiveness=32,n_poses=20,out_n_poses = 20) + # restore mol2 + print('Restore mol2 file: ', ligand.name) + ins = PDBQTtoMol2(ligand.read_text(),ligand_pdbqt.read_text(),PDBQTparser(docked_file).get_modules()) + res_mol2 = ins.to_string() # mol2 string list + # write mol2 + for n,m in enumerate(res_mol2): + mol2_file = ligand.parent.joinpath(f'{ligand.stem}+{str(n+1).zfill(3)}.mol2') + mol2_file.write_text(m) + # cal RMSD(spyrmsd) + print('Calculate RMSD(spyrmsd): ') + res = symmrmsd_mol2_list(mol2_docked=res_mol2, mol2_ref=ligand.read_text()) + res_list = ['{:.2f}'.format(i) for i in res] + print(f''' +SCARdock docking result: {docked_file} +RMSD: {res_list}''') + +def SCARdock(): + # cmd lineparser + parser = argparse.ArgumentParser(description='SCARdock Docking') + parser.add_argument('-r', '--receptor', metavar="recepotr file", nargs='?', default=sys.stdin, help='recepotr file, support pdb') + parser.add_argument('-l', '--ligand', metavar="ligand file", nargs='?', default=sys.stdin, help='ligand file, molecule file (MOL2, SDF,...)(use meeko prepare)') + parser.add_argument('-s', '--site', metavar="residue covalent site", nargs='?', default=sys.stdin, help='residue covalent site') + parser.add_argument('-c', '--chain', metavar="covalent chain ID", nargs='?', default=sys.stdin, help='covalent chain ID') + parser.add_argument('-log', '--log_dir', metavar="output log directory", nargs='?', default='./', help='Relative Path') + args = parser.parse_args() + # 使用logger.add()方法,指定日志文件的路径和名称,以及编码方式 + log_file = Path(args.log_dir) / f'scardock.log' + logger.add(log_file.as_posix(), encoding='utf-8') + logger.info(f'SCARdock docking...') + # command line SCARdock + print('''SCARdock method DOI: 10.1021/acs.jcim.6b00334 +SCARdock screening server(https://scardock.com) DOI: 10.1021/acsomega.2c08147 +lab site: http://liugroup.site +author: Lingyu Zeng mail: pylyzeng@gmail.com +''') + SCARdockbase(receptor=Path(args.receptor), ligand=Path(args.ligand), chain=args.chain, site=str(args.site)) + +def SCARdocktest(): + # Run SCARdock with predefined test parameters + test_dir = here / 'test' + receptor = test_dir.joinpath('4i24.pdb').as_posix() + ligand = test_dir.joinpath('4i24_C_1C9_babel_addh.mol2').as_posix() + site = 797 + chain = 'A' + SCARdockbase(receptor=Path(receptor), ligand=Path(ligand), chain=chain, site=str(site)) + +def cleanATOM(pdb_file, out_file=None, ext="_clean.pdb")->Path: + """Extract all ATOM and TER records in a PDB file and write them to a new file. + + Args: + pdb_file (str): Path of the PDB file from which ATOM and TER records + will be extracted + out_file (str): Optional argument to specify a particular output filename. + Defaults to .clean.pdb. + ext (str): File extension to use for output file. Defaults to ".clean.pdb" + """ + # find all ATOM and TER lines + with open(pdb_file, "r") as fid: + good = [l for l in fid if l.startswith(("ATOM", "TER"))] + + # default output file to _clean.pdb + if out_file is None: + out_file = os.path.splitext(pdb_file)[0] + ext + + # write the selected records to a new file + with open(out_file, "w") as fid: + fid.writelines(good) + return Path(out_file) + + + + + + + + + + + diff --git a/vinautil/vinautil/untitled.txt b/vinautil/vinautil/untitled.txt new file mode 100644 index 0000000..eef276b --- /dev/null +++ b/vinautil/vinautil/untitled.txt @@ -0,0 +1,17 @@ + - bioconda::mgltools + - conda-forge::spyrmsd + - conda-forge::pandas + - conda-forge::openbabel + - conda-forge::rdkit + - conda-forge::pymol-open-source + - conda-forge::loguru + - conda-forge::swig + - conda-forge::boost-cpp + - conda-forge::sphinx + - conda-forge::sphinx_rtd_theme + - conda-forge::vina + - conda-forge::ipython + - conda-forge::biopython + - conda-forge::prody # meeko dependecy (optionally, for covalent docking) + +conda install mgltools spyrmsd pandas openbabel rdkit pymol-open-source loguru numpy swig boost-cpp sphinx sphinx_rtd_theme vina ipython scipy biopython prody \ No newline at end of file diff --git a/vinautil/vinautil/vina.py b/vinautil/vinautil/vina.py new file mode 100644 index 0000000..3f311f3 --- /dev/null +++ b/vinautil/vinautil/vina.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Vina +# + +import os +import glob +import stat +import sys + +import numpy as np + +from vina.vina_wrapper import Vina as _Vina +from vina import utils + + +class Vina: + def __init__(self, sf_name='vina', cpu=0, seed=0, no_refine=False, verbosity=1): + """Initialize a Vina object. + + Args: + sf_name (str): Scoring function name to use (Vina or ad4) (default: vina) + cpu (int): Number of CPU to use (default: 0; use all of them) + seed (int): Random seed (default: 0; ramdomly choosed) + no_refine (boolean): when receptor is provided, do not use explicit receptor atoms + (instead of precalculated grids) for: (1) local optimization and scoring after docking, + (2) --local_only jobs, and (3) --score_only jobs (default: False) + verbosity (int): verbosity 0: not output, 1: normal, 2: verbose (default: 1; some output) + + """ + sf_name = sf_name.lower() + if not sf_name in ('vina', 'vinardo', 'ad4'): + raise ValueError('Error: Scoring function %s not recognized. (only vina, vinardo or ad4)' % sf_name) + + self._vina = _Vina(sf_name, cpu, seed, verbosity, no_refine) + + self._sf_name = sf_name + if sf_name == 'vina': + self._weights = (-0.035579, -0.005156, 0.840245, -0.035069, -0.587439, 50, 0.05846) + elif sf_name == 'vinardo': + self._weights = (-0.045, 0.8, -0.035, -0.6, 50, 0.05846) + else: + self._weights = (0.1662, 0.1209, 0.1406, 0.1322, 50) + self._rigid_receptor = None + self._flex_receptor = None + self._ligands = None + self._center = None + self._box_size = None + self._spacing = None + self._even_nelements = True + if sf_name in ('vina', 'vinardo'): + self._no_refine = no_refine + else: + self._no_refine = False + self._seed = self._vina.seed() + + def __str__(self): + """Print basic information about Docking configuration (rigid receptor, flex receptor, + ligands, scoring function, weights, no_refine, box center, box dimensions, + box spacing, box even nelements, seed). + + """ + info = "Receptor (rigid): %s\n" % self._rigid_receptor + info += "Receptor (flex): %s\n" % self._flex_receptor + + if isinstance(self._ligands, (list, tuple)): + info += "Ligands: %s\n" % ", ".join(self._ligands) + else: + info += "Ligand: %s\n" % self._ligands + + info += "Scoring function: %s\n" % self._sf_name + info += "Weights: %s\n" % " ".join(["%.6f" % i for i in self._weights]) + info += "No refinement with receptor atoms: %s\n" % self._no_refine + + if self._center is not None: + info += "Box center: %s\n" % " ".join(["%.3f" % i for i in self._center]) + info += "Box dimensions: %s\n" % " ".join(["%.2f" % i for i in self._box_size]) + info += "Box spacing: %.3f\n" % self._spacing + info += "Box even NELEMENTS: %s\n" % self._even_nelements + + info += "Seed: %s" % self._seed + + return info + + def cite(self): + """Print citation message.""" + self._vina.cite() + + def info(self): + """Return information about the docking configuration. + + Returns: + dict (str): Dictionary of information about the docking configuration. + + Information: + rigid_receptor (str), + flex_receptor (str), + ligands (list), + scoring_function (str), + weights (tuple), + no_refine (bool), + box_center (list), + box_size (list), + box_spacing (float), + box_even_elements (bool), + seed (int) + + """ + info = {} + + info['rigid_receptor'] = self._rigid_receptor + info['flex_receptor'] = self._flex_receptor + info['ligands'] = self._ligands + info['scoring_function'] = self._sf_name + info['weights'] = self._weights + info['no_refine'] = self._no_refine + info['box_center'] = self._center + info['box_size'] = self._box_size + info['box_spacing'] = self._spacing + info['box_even_elements'] = self._even_nelements + info['seed'] = self._seed + + return info + + def set_receptor(self, rigid_pdbqt_filename=None, flex_pdbqt_filename=None): + """Set receptor. + + Args: + rigid_pdbqt_filename (str): rigid pdbqt receptor filename (default: None) + flex_pdbqt_filename (str): flexible residues pdbqt filename (default: None) + + """ + if rigid_pdbqt_filename is None and flex_pdbqt_filename is None: + raise ValueError('Error: No (rigid) receptor or flexible residues were specified') + + # For the rigid part of the receptor + if rigid_pdbqt_filename is not None: + if not os.path.exists(rigid_pdbqt_filename): + raise RuntimeError('Error: file %s does not exist.' % rigid_pdbqt_filename) + _, extension = os.path.splitext(rigid_pdbqt_filename) + if not extension == '.pdbqt': + raise TypeError('Error: Vina requires a PDBQT file for the (rigid) receptor.') + + # For the flex part of the receptor + if flex_pdbqt_filename is not None: + if not os.path.exists(flex_pdbqt_filename): + raise RuntimeError('Error: file %s does not exist.' % flex_pdbqt_filename) + _, extension = os.path.splitext(flex_pdbqt_filename) + if not extension == '.pdbqt': + raise TypeError('Error: Vina requires a PDBQT file for the (flex) receptor.') + + if rigid_pdbqt_filename is not None: + if flex_pdbqt_filename is not None: + self._vina.set_receptor(rigid_pdbqt_filename, flex_pdbqt_filename) + else: + self._vina.set_receptor(rigid_pdbqt_filename) + else: + self._vina.set_receptor('', flex_pdbqt_filename) + + self._rigid_receptor = rigid_pdbqt_filename + self._flex_receptor = flex_pdbqt_filename + + def set_ligand_from_file(self, pdbqt_filename): + """Set ligand(s) from a file. The chemical file format must be PDBQT. + + Args: + pdbqt_filename (str or list): Name or list of PDBQT filename(s) + + """ + if not isinstance(pdbqt_filename, (list, tuple)): + pdbqt_filename = [pdbqt_filename] + + for pf in pdbqt_filename: + if not os.path.exists(pf): + raise RuntimeError('Error: file %s does not exist.' % pf) + _, extension = os.path.splitext(pf) + if not extension == '.pdbqt': + raise TypeError('Error: Vina requires a PDBQT file for the ligand.') + + if len(pdbqt_filename) == 1: + self._vina.set_ligand_from_file(pdbqt_filename[0]) + else: + self._vina.set_ligand_from_file(pdbqt_filename) + + self._ligands = pdbqt_filename + + def set_ligand_from_string(self, pdbqt_string): + """Set ligand(s) from a string. The chemical file format must be PDBQT. + + Args: + pdbqt_string (str or list): string or list of PDBQT strings + + """ + if not isinstance(pdbqt_string, (list, tuple)): + pdbqt_string = [pdbqt_string] + + for ps in pdbqt_string: + if not isinstance(ps, str): + raise TypeError('Error: %s is not a string.' % ps) + + if len(pdbqt_string) == 1: + self._vina.set_ligand_from_string(pdbqt_string[0]) + else: + self._vina.set_ligand_from_string(pdbqt_string) + + self._ligands = pdbqt_string + + def set_weights(self, weights): + """Set potential weights for vina or ad4 scoring function. + + Args: + weights (list): list or weights + + """ + if not isinstance(weights, (list, tuple)): + raise TypeError('Error: Cannot set weights (%s).' % weights) + if self._sf_name == 'vina': + if len(weights) != 7: + raise ValueError('Error: Number of weights does not correspond to Vina scoring function.' ) + self._vina.set_vina_weights(*weights) + else: + if len(weights) != 6: + raise ValueError('Error: Number of weights does not correspond to AD4 or Vinardo scoring function.') + if self._sf_name == 'ad4': + self._vina.set_ad4_weights(*weights) + else: + self._vina.set_vinardo_weights(*weights) + + self._weights = weights + + def compute_vina_maps(self, center, box_size, spacing=0.375, force_even_voxels=False): + """Compute affinity maps using Vina scoring function. + + Args: + center (list): center position + box_size (list): size of the box in Angstrom + spacing (float): grid spacing (default: 0.375) + force_even_voxels (boolean): Force the number of voxels (NPTS/NELEMENTS) to be an even number + (and forcing the number of grid points to be odd) (default: False) + + """ + if len(center) != 3: + raise ValueError('Error: center of the box needs to be defined by (x, y, z) in Angstrom.') + elif len(box_size) != 3: + raise ValueError('Error: box size needs to be defined by (a, b, c) in Angstrom.') + elif not all([i > 0 for i in box_size]): + raise ValueError('Error: box dimensions are required to be positive.') + elif spacing <= 0: + raise ValueError('Error: spacing should be greater than zero.') + + x, y, z = center + a, b, c = box_size + + self._vina.compute_vina_maps(x, y, z, a, b, c, spacing, force_even_voxels) + + self._center = center + self._box_size = box_size + self._spacing = spacing + self._voxels = np.ceil(np.array(box_size) / self._spacing).astype(np.int64) + + # Necessary step to know if we can write maps or not later + if force_even_voxels: + self._voxels[0] += int(self._voxels[0] % 2 == 1) + self._voxels[1] += int(self._voxels[1] % 2 == 1) + self._voxels[2] += int(self._voxels[2] % 2 == 1) + + def load_maps(self, map_prefix_filename): + """Load vina or ad4 affinity maps. + + Args: + map_prefix_filename (str): affinity map prefix filename + + """ + if not glob.glob('%s.*.map' % map_prefix_filename): + raise RuntimeError('Error: Cannot find affinity maps with %s' % map_prefix_filename) + + self._vina.load_maps(map_prefix_filename) + + def write_maps(self, map_prefix_filename='receptor', gpf_filename='NULL', + fld_filename='NULL', receptor_filename='NULL', overwrite=False): + """Write affinity maps. + + Args: + map_prefix_filename (str): affinity map pathname (path directory + prefix) + gpf_filename (str): grid protein filename (default: NULL) + fld_filename (str): fld filename (default: NULL) + receptor filename (str): receptor filename (default: NULL) + overwrite (bool): allow overwriting (default: false) + + """ + if self._center is None: + raise RuntimeError('Error: no affinity maps were defined.') + elif not overwrite: + existing_maps = glob.glob('%s.*.map' % map_prefix_filename) + if existing_maps: + raise RuntimeError('Error: Cannot overwrite existing affinity maps (%s)' % existing_maps) + + if any(self._voxels % 2 == 1): + error_msg = 'Error: Can\'t write maps. Number of voxels (NPTS/NELEMENTS) is odd: %s.' % self._voxels + error_msg += ' Set \'force_even_voxels\' to True when computing vina maps.' + raise RuntimeError(error_msg) + + self._vina.write_maps(map_prefix_filename, gpf_filename, fld_filename, receptor_filename) + + def write_pose(self, pdbqt_filename, remarks='', overwrite=False): + """Write pose (after randomize or optimize). + + Args: + pdbqt_filename (str): output PDBQT filename + remarks (str): REMARKS to add in the PDBQT filename + overwrite (bool): allow overwriting (default: false) + + """ + if not utils.check_file_writable(pdbqt_filename): + raise RuntimeError('Error: Cannot write pose at %s.' % pdbqt_filename) + elif not overwrite: + if os.path.exists(pdbqt_filename): + raise RuntimeError('Error: Cannot overwrite %s, already exists.' % pdbqt_filename) + + self._vina.write_pose(pdbqt_filename, remarks) + + def write_poses(self, pdbqt_filename, n_poses=9, energy_range=3.0, overwrite=False): + """Write poses from docking. + + Args: + pdbqt_filename (str): PDBQT file containing poses found + n_pose (int): number of poses to write (default: 9) + energy_range (float): maximum energy difference from best pose (default: 3.0 kcal/mol) + overwrite (bool): allow overwriting (default: false) + + """ + if not utils.check_file_writable(pdbqt_filename): + raise RuntimeError( + 'Error: Cannot write docking results at %s.' % pdbqt_filename) + elif not overwrite: + if os.path.exists(pdbqt_filename): + raise RuntimeError( + 'Error: Cannot overwrite %s, already exists.' % pdbqt_filename) + elif n_poses <= 0: + raise ValueError( + 'Error: number of poses written must be greater than zero.') + elif energy_range <= 0: + raise ValueError('Error: energy range must be greater than zero.') + + self._vina.write_poses(pdbqt_filename, n_poses, energy_range) + + def poses(self, n_poses=9, energy_range=3.0, coordinates_only=False): + """Get poses from docking. + + Args: + n_pose (int): number of poses to retrieve (default: 9) + energy_range (float): maximum energy difference from best pose (default: 3.0 kcal/mol) + coordinates_only (bool): return coordinates for each pose only + + Returns: + str/ndarray: PDBQT file string or Array of coordinates of poses if coordinates_only=True + + """ + if n_poses <= 0: + raise ValueError('Error: number of poses written must be greater than zero.') + elif energy_range <= 0: + raise ValueError('Error: energy range must be greater than zero.') + + if coordinates_only: + # Dirty hack to get the coordinates from C++, it is not advised to have vector>> + coordinates = np.array(self._vina.get_poses_coordinates(n_poses, energy_range)) + coordinates = coordinates.reshape((coordinates.shape[0], coordinates.shape[1] // 3, 3)) + coordinates = np.around(coordinates, decimals=3) + return coordinates + else: + return self._vina.get_poses(n_poses, energy_range) + + def energies(self, n_poses=9, energy_range=3.0): + """Get pose energies from docking. + + Args: + n_pose (int): number of poses to retrieve (default: 9) + energy_range (float): maximum energy difference from best pose (default: 3.0 kcal/mol) + + Returns: + ndarray: Array of energies from each pose (rows=poses, columns=energies) + + Vina/Vinardo FF: + columns=[total, inter, intra, torsions, intra best pose] + + AutoDock FF: + columns=[total, inter, intra, torsions, -intra] + + """ + if n_poses <= 0: + raise ValueError('Error: number of poses written must be greater than zero.') + elif energy_range <= 0: + raise ValueError('Error: energy range must be greater than zero.') + + return np.around(self._vina.get_poses_energies(n_poses, energy_range), decimals=3) + + def randomize(self): + """Randomize the input ligand conformation.""" + self._vina.randomize() + + def score(self): + """Score current pose. + + Returns: + nadarray: Array of energies from current pose. + + Vina/Vinardo FF: + columns=[total, lig_inter, flex_inter, other_inter, flex_intra, lig_intra, torsions, lig_intra best pose] + + AutoDock FF: + columns=[total, lig_inter, flex_inter, other_inter, flex_intra, lig_intra, torsions, -lig_intra] + + """ + # It does not make sense to report energies with a precision higher than 3 + # since the coordinates precision is 3. + energies = np.around(self._vina.score(), decimals=3) + return energies + + def optimize(self, max_steps=0): + """Quick local BFGS energy optimization. + + Args: + max_steps (int): Maximum number of local minimization steps (default: 0). When max_steps is equal to 0, + the maximum number of steps will be equal to (25 + num_movable_atoms) / 3). + + Returns: + nadarray: Array of energies from optimized pose. + + Vina/Vinardo FF: + columns=[total, lig_inter, flex_inter, other_inter, flex_intra, lig_intra, torsions, lig_intra best pose] + + AutoDock FF: + columns=[total, lig_inter, flex_inter, other_inter, flex_intra, lig_intra, torsions, -lig_intra] + + """ + if max_steps < 0: + raise ValueError('Error: max_steps cannot be negative.') + + # It does not make sense to report energies with a precision higher than 3 + # since the coordinates precision is 3. + energies = np.around(self._vina.optimize(max_steps), decimals=3) + return energies + + def dock(self, exhaustiveness=8, n_poses=20, min_rmsd=1.0, max_evals=0): + """Docking: global search optimization. + + Args: + exhaustiveness (int): Number of MC run (default: 8) + n_poses (int): number of pose to generate (default: 20) + min_rmsd (float): minimum RMSD difference between poses (default: 1.0 Ansgtrom) + max_evals (int): Maximum number of evaluation (default: 0; use heuristics rules) + + """ + if exhaustiveness <= 0: + raise ValueError('Error: exhaustiveness must be 1 or greater.') + elif n_poses <= 0: + raise ValueError('Error: number of poses to generate must be greater than zero.') + elif min_rmsd <= 0: + raise ValueError('Error: minimal RMSD must be greater than zero.') + elif max_evals < 0: + raise ValueError('Error: maximum evaluations must be positive.') + + self._vina.global_search(exhaustiveness, n_poses, min_rmsd, max_evals) diff --git a/vinautil/vinautil/vinaconfig.py b/vinautil/vinautil/vinaconfig.py new file mode 100644 index 0000000..9fde017 --- /dev/null +++ b/vinautil/vinautil/vinaconfig.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :vinaconfig.py +@Description: : +@Date :2022/11/11 15:17:33 +@Author :hotwa +@version :1.0 +''' +from pathlib import Path, PurePath +from dataclasses import dataclass +from typing import Iterable +from collections.abc import Iterable as _Iterator + + +from vinautil.utils.typecheck import typeassert + + +@typeassert(x=float, y=float, z=float) +@dataclass +class xyz_point: + __slots__ = ['x', 'y', 'z', '__dict__'] + x: float + y: float + z: float + + def __init__(self, x, y, z): + self.x = float(x) + self.y = float(y) + self.z = float(z) + + def __repr__(self): + return 'xyz point: {},{},{}'.format(self.x, self.y, self.z) + + @classmethod + def from_iterable(cls, iterable_object: Iterable): + if isinstance(iterable_object, _Iterator): + return cls(*iterable_object) + +@typeassert(receptor=PurePath, + ligand=PurePath, + box_size=xyz_point, + center=xyz_point, + outpdbqtfile=PurePath, + exhaustiveness=int, + num_modes=int, + energy_range=int) +@dataclass +class vinaconfig: + receptor: PurePath + ligand: PurePath + box_size: xyz_point + center: xyz_point + outpdbqtfile: PurePath + exhaustiveness: int + num_modes: int + energy_range: int + + __slots__ = ['receptor', 'ligand', 'center', + 'box_size', 'outpdbqtfile', '__dict__', + 'exhaustiveness', 'num_modes', 'energy_range'] + + def __init__(self, + receptor, ligand, + center, + box_size, + outpdbqtfile, + exhaustiveness = 32, + num_modes = 20, + energy_range = 5, + ): + self.receptor = receptor + self.ligand = ligand + self.center = center if isinstance(center, xyz_point) else xyz_point.from_iterable(center) + self.box_size = box_size if isinstance(box_size, xyz_point) else xyz_point.from_iterable(box_size) + self.outpdbqtfile = outpdbqtfile + self.exhaustiveness = exhaustiveness + self.num_modes = num_modes + self.energy_range = energy_range + + def to_dict(self )->dict: + return {'receptor': self.receptor, + 'ligand': self.ligand, + 'center': self.center, + 'box_size': self.box_size, + 'outpdbqtfile': self.outpdbqtfile, + 'exhaustiveness': self.exhaustiveness, + 'num_modes': self.num_modes, + 'energy_range': self.energy_range} + + def to_txt(self, file :Path, + cx = None, + cy = None, + cz = None, + sx = None, + sy = None, + sz = None, + exhaustiveness = None, + num_modes = None, + energy_range = None, + absolute_path=False) -> None: + """to_txt save config to file supported autodock vina 1.1.2 and 1.2.3 binary file + + use: vina --config config.txt + + Arguments: + file {file path} -- out put file + """ + if absolute_path: + receptor_path = self.receptor.absolute().as_posix() + ligand_path = self.ligand.absolute().as_posix() + outpdbqtfile_path = self.outpdbqtfile.absolute().as_posix() + else: + receptor_path = self.receptor.as_posix() + ligand_path = self.ligand.as_posix() + outpdbqtfile_path = self.outpdbqtfile.as_posix() + exhaustiveness = exhaustiveness if exhaustiveness else self.exhaustiveness + num_modes = num_modes if num_modes else self.num_modes + energy_range = energy_range if energy_range else self.energy_range + cx = cx if cx else self.center.x + cy = cy if cy else self.center.y + cz = cz if cz else self.center.z + sx = sx if sx else self.box_size.x + sy = sy if sy else self.box_size.y + sz = sz if sz else self.box_size.z + content = f''' +receptor = {receptor_path} +ligand = {ligand_path} + +center_x = {cx} +center_y = {cy} +center_z = {cz} + +size_x = {sx} +size_y = {sy} +size_z = {sz} + + +exhaustiveness = {exhaustiveness} + +num_modes = {num_modes} + +energy_range = {energy_range} + +out = {outpdbqtfile_path} +''' + file, file_dir = Path(file), Path(file).parent + if not file_dir.exists(): file_dir.mkdir(parents=True) + with open(file.absolute().as_posix(), 'w', encoding='utf-8') as f: + f.write(content) \ No newline at end of file diff --git a/vinautil/vinautil/vutils/__init__.py b/vinautil/vinautil/vutils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vinautil/vinautil/vutils/cifparse.py b/vinautil/vinautil/vutils/cifparse.py new file mode 100644 index 0000000..a22a40e --- /dev/null +++ b/vinautil/vinautil/vutils/cifparse.py @@ -0,0 +1,116 @@ +from pathlib import Path +import gzip +from dataclasses import dataclass, field +from typing import List, Tuple, Dict, Union, Optional, NoReturn, Generator, Iterable, Callable, Any +from Bio.PDB.MMCIF2Dict import MMCIF2Dict as mmcifdict +from pdbecif.mmcif_tools import MMCIF2Dict + +try: + from .typecheck import typeassert + from .pdbparse_module import Bdp +except (ImportError, ModuleNotFoundError, AttributeError, NameError, SyntaxError): + from typecheck import typeassert + from pdbparse_module import Bdp + +@typeassert(file = Path) +@dataclass +class cif_parse(): + """ + parse cif file + """ + file: Path + pid: str = field(default_factory=lambda: None) + file_content: str = field(init=False) + cif_dict: dict = field(init=False) + bio_cif_dict: Dict = field(init=False) + pdbecif_dict: Dict = field(init=False) + + def __post_init__(self): + if self.file.suffix == '.gz': + self.file_content = self.__get_cif_gz_content() + self.pdbecif_dict = MMCIF2Dict().parse(self.file.as_posix()) + self.bio_cif_dict = mmcifdict(self.file.as_posix()) + self.cif_dict = self.pdbecif_dict + if self.pid is None: + self.pid = self.__get_pid() + + def __get_cif_gz_content(self) -> str: + if '.gz' in self.file.suffixes: + with gzip.open(self.file.as_posix(), 'rt') as f: + content = f.read() + else: + raise ValueError(f"{self.file.as_posix()} is not a gzip file, or not suffix with '.gz'") + if '.cif' in Path(self.file.stem).suffix: + rename = self.file.stem + else: + rename = self.file.name.split('.')[0] + '.cif' + ungzip_file = self.file.parent.joinpath(rename).as_posix() + with open(ungzip_file, 'w', encoding='utf-8') as f: + f.write(content) + self.file = Path(ungzip_file) + return content + + def search_keylike(self, key: str) -> List[str]: + return [{i:self.cif_dict[i]} for i in self.cif_dict.keys() if key in i] + + def __get_pid(self) -> str: + if len(self.bio_cif_dict['_entry.id']) == 1: + return self.bio_cif_dict['_entry.id'][0] + else: + raise ValueError(f"More than one PDB ID in {self.file.as_posix()}") + + @property + def conn_infos(self) -> Dict: + return self.cif_dict[self.pid]['_struct_conn'] + + @property + def auth_comp_id(self) -> List[str]: + return list(set(self.bio_cif_dict['_struct_site.pdbx_auth_comp_id'])) + + @property + def covalent_bonds(self) -> bool: + if 'covale' in self.bio_cif_dict['_struct_conn.conn_type_id']: + return True + else: + return False + + + + + + + + +""" +_pdbx_validate_symm_contact.id +_pdbx_validate_torsion.auth_asym_id +_pdbx_struct_assembly_gen.asym_id_list +_struct_conn.conn_type_id +_struct_site.pdbx_auth_asym_id +_struct_site.pdbx_auth_comp_id +_struct_site.details +_struct_conn.conn_type_id字段还有许多其他连接类型,包括: + +covale: 共价键 +disulf: 二硫键 +covale1: 共价键1 +covale2: 共价键2 +covale3: 共价键3 +hbond: 电子转移键 +metalc: 金属配位键 +metalc1: 金属配位键1 +metalc2: 金属配位键2 + +disulf: 双硫键连接。 +covale: 共价连接。 +metalc: 金属连接。 +saltbr: 离子之间的相互作用。 +hydrogen: 氢键连接。 +water: 水分子之间的连接。 +helix: 螺旋结构。 +turn: 旋转结构。 +cis: 同侧螺旋结构。 +trans: 异侧螺旋结构。 +xlink: 蛋白质间的交联。 +covalent: 共价连接。 +""" \ No newline at end of file diff --git a/vinautil/vinautil/vutils/comm.py b/vinautil/vinautil/vutils/comm.py new file mode 100644 index 0000000..5cba62b --- /dev/null +++ b/vinautil/vinautil/vutils/comm.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :comm +@Description: : +@Date :2023/1/2 21:26:21 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' +from json import dumps +import pickle +from pathlib import Path +import os + +def piksave(obj:object,filename:str): + with open(filename, 'wb') as f: + pickle.dump(obj,f) + +def pikload(filename): + with open(filename, 'rb') as f: + return pickle.load(f) +def bejson(d:dict(help='need to beauty json')) -> dict: + return dumps(d,indent=4,ensure_ascii=False) + + + +def find_file(env_var, keyword): + # 获取环境变量中的所有路径 + paths = os.environ.get(env_var).split(os.pathsep) + # 存储匹配的文件 + matches = [] + # 遍历每个路径 + for path in paths: + # 判断路径是否存在 + if Path(path).exists(): + # 使用glob或rglob方法匹配包含关键字的文件,并将结果添加到匹配列表中(注意转义特殊字符) + matches.extend(Path(path).glob(f"*{keyword}*")) + # 判断匹配列表的长度 + if len(matches) == 0: # 没有找到任何文件,返回None + return None + elif len(matches) == 1: # 只找到一个文件,返回Path对象(取第一个元素) + return matches[0] + else: # 找到多个文件,抛出异常,并显示匹配列表中的所有文件名和路径信息 + raise Exception(f"Multiple files found: {matches}") \ No newline at end of file diff --git a/vinautil/vinautil/vutils/config.py b/vinautil/vinautil/vutils/config.py new file mode 100644 index 0000000..ef44962 --- /dev/null +++ b/vinautil/vinautil/vutils/config.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :configutils.py +@Description: : generate config file for autodock vina +@Date :2022/12/28 21:58:08 +@Author :hotwa +@version :1.1 +''' +import sys +from pathlib import Path +here = Path(__file__).parent.resolve() +from dataclasses import dataclass, field +from typing import Iterable +from openbabel import pybel +try: + from .typecheck import typeassert +except ImportError: + from vutils.typecheck import typeassert + +@typeassert(file=Path, fmt=str) +class molecule: + file: Path + + def __init__(self,file: Path, fmt:str=None): + self.file = file + self.fmt = fmt if bool(fmt) else file.suffix.replace('.', '') + + def get_coord_lst(self, addHs:bool = True): + molH = pybel.readfile(format = self.fmt, filename=self.file.as_posix()) + molH = next(molH) + # molH.OBMol.DeleteHydrogens() + if addHs: molH.OBMol.AddPolarHydrogens() + return [list(atom.coords) for atom in molH] + + def get_coord_dict(self, addHs:bool = True): + molH = pybel.readfile(format = self.fmt, filename=self.file.as_posix()) + molH = next(molH) + if addHs: molH.OBMol.AddPolarHydrogens() + return {atom.idx: atom.coords for atom in molH} + + def compute_box(self, extending = 10): + coords_lst = self.get_coord_lst() + xlst, ylst, zlst = [i[0] for i in coords_lst], [i[1] for i in coords_lst], [i[2] for i in coords_lst] + ([minX, minY, minZ], [maxX, maxY, maxZ]) = ([min(xlst), min(ylst), min(zlst)], [max(xlst), max(ylst), max(zlst)]) + minX = minX - float(extending) + minY = minY - float(extending) + minZ = minZ - float(extending) + maxX = maxX + float(extending) + maxY = maxY + float(extending) + maxZ = maxZ + float(extending) + SizeX = maxX - minX + SizeY = maxY - minY + SizeZ = maxZ - minZ + CenterX = (maxX + minX) / 2 + CenterY = (maxY + minY) / 2 + CenterZ = (maxZ + minZ) / 2 + c = { + 'x': float("{:.2f}".format(CenterX)), + 'y': float("{:.2f}".format(CenterY)), + 'z': float("{:.2f}".format(CenterZ)), + } + s = { + 'x': float("{:.2f}".format(SizeX)), + 'y': float("{:.2f}".format(SizeY)), + 'z': float("{:.2f}".format(SizeZ)) + } + rd = { + 'center': xyz_point(**c), + 'box_size': xyz_point(**s) + } + return rd + +@typeassert(x=(float, int), y=(float, int), z=(float, int)) +@dataclass +class xyz_point: + x: (float, int) + y: (float, int) + z: (float, int) + + def __repr__(self): + return f"xyz_point(x={self.x}, y={self.y}, z={self.z})" + + def __str__(self): + return f"three demensional point: x={self.x}, y={self.y}, z={self.z}" + + @classmethod + def from_iterable(cls,iterable_object:Iterable): + if isinstance(iterable_object,Iterable): + return cls(*iterable_object) + else: + raise TypeError('iterable_object must be Iterable') + +@typeassert(receptor=Path, + ligand=Path, + box_size=(xyz_point, type(None)), + center=(xyz_point, type(None)), + outpdbqtfile=Path, + exhaustiveness=int, + num_modes=int, + energy_range=int) +@dataclass +class vinaconfig: + receptor: Path + ligand: Path + outpdbqtfile: Path + center: (xyz_point, type(None)) = field(default_factory=lambda: None) + box_size: (xyz_point, type(None)) = field(default_factory=lambda: None) + exhaustiveness: int = field(default_factory=lambda: 32) + num_modes: int = field(default_factory=lambda: 20) + energy_range: int = field(default_factory=lambda: 5) + extend: int = field(default_factory=lambda: 10) + + def __str__(self): + return f"""the class of autodock vina config file:\n\nreceptor: {self.receptor}\nligand: {self.ligand}\ncenter: {self.center}\nbox_size: {self.box_size}\noutpdbqtfile: {self.outpdbqtfile}\nexhaustiveness: {self.exhaustiveness}\nnum_modes: {self.num_modes}\nenergy_range: {self.energy_range}\nextend: {self.extend}\n""" + + def __post_init__(self): + if (self.center == None) and (self.box_size == None): + m = molecule(self.receptor) + box = m.compute_box(extending = self.extend) + self.center = box['center'] + self.box_size = box['box_size'] + + def to_dict(self)->dict: + return {'receptor': self.receptor, + 'ligand': self.ligand, + 'center': self.center, + 'box_size': self.box_size, + 'outpdbqtfile': self.outpdbqtfile, + 'exhaustiveness': self.exhaustiveness, + 'num_modes': self.num_modes, + 'energy_range': self.energy_range} + + def to_txt(self, file:Path, + cx = None, + cy = None, + cz = None, + sx = None, + sy = None, + sz = None, + exhaustiveness = None, + num_modes = None, + energy_range = None, + absolute_path=False) -> None: + """to_txt save config to file supported autodock vina 1.1.2 and 1.2.3 binary file + + use: vina --config config.txt + + Arguments: + file {file path} -- out put file + """ + if absolute_path: + receptor_path = self.receptor.absolute().as_posix() + ligand_path = self.ligand.absolute().as_posix() + outpdbqtfile_path = self.outpdbqtfile.absolute().as_posix() + else: + receptor_path = self.receptor.as_posix() + ligand_path = self.ligand.as_posix() + outpdbqtfile_path = self.outpdbqtfile.as_posix() + exhaustiveness = exhaustiveness if exhaustiveness else self.exhaustiveness + num_modes = num_modes if num_modes else self.num_modes + energy_range = energy_range if energy_range else self.energy_range + cx = cx if cx else self.center.x + cy = cy if cy else self.center.y + cz = cz if cz else self.center.z + sx = sx if sx else self.box_size.x + sy = sy if sy else self.box_size.y + sz = sz if sz else self.box_size.z + content = f''' +receptor = {receptor_path} +ligand = {ligand_path} + +center_x = {cx} +center_y = {cy} +center_z = {cz} + +size_x = {sx} +size_y = {sy} +size_z = {sz} + + +exhaustiveness = {exhaustiveness} + +num_modes = {num_modes} + +energy_range = {energy_range} + +out = {outpdbqtfile_path} +''' + file,file_dir = Path(file), Path(file).parent + if not file_dir.exists(): file_dir.mkdir(parents=True) + with open(file.absolute().as_posix(), 'w', encoding='utf-8') as f: + f.write(content) \ No newline at end of file diff --git a/vinautil/vinautil/vutils/docker.py b/vinautil/vinautil/vutils/docker.py new file mode 100644 index 0000000..9c4a458 --- /dev/null +++ b/vinautil/vinautil/vutils/docker.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :docker +@Description: : python调用docker相关类 +@Date :2023/1/5 19:19:02 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' +import docker +from docker.tls import TLSConfig +from pathlib import Path +from dataclasses import dataclass, field + +''' +docker -H=tcp://remote-server-ip:2376 --tlsverify --tlscacert=ca.pem --tlscert=cert.pem --tlskey=key.pem version +docker daemon --tls --tlscert=/path/to/server-cert.pem --t +''' + +''' +# 语法参考使用: +mypassphrase = 'TLSdocker' +client = docker.DockerClient(base_url='tcp://remote-server-ip:2376', tls=True, + tls_client_key='ca-key.pem', tls_client_key_passphrase=mypassphrase, + tls_client_cert='ca.pem') +# 配置TLS连接 +tls_config = TLSConfig( + client_cert=('/path/to/client-cert.pem', '/path/to/client-key.pem'), + ca_cert='/path/to/ca.pem', + verify=True +) +# 连接到Docker守护进程 +client = docker.DockerClient(base_url='tcp://localhost:2376', tls=tls_config, tls_client_key_passphrase=mypassphrase) +# 执行docker操作 +project_name = "project_name" +number_of_solutions = "number_of_solutions" +best_PDB = "best_PDB" +restraint = "restraint" + +output = client.containers.run( + "pegi3s/pydock3", + "bash -c './run_ftdock {} {} {} {}'".format(project_name, number_of_solutions, best_PDB, restraint), + volumes={'/your/data/dir': {'bind': '/data', 'mode': 'rw'}}, + remove=True, +) +print(output) +''' +@dataclass() +class DockerClient(): + ''' + docker client + # 配置TLS连接docker client + tls_config = TLSConfig( + client_cert=('/path/to/client-cert.pem', '/path/to/client-key.pem'), + ca_cert='/path/to/ca.pem', + verify=True + ) + ''' + base_url: str # for mmme server is 'tcp://192.168.100.110:12376' + # tls: TLSConfig = field(default=None) + # passphrase: str = field(default=None) + + def __post_init__(self): + # self.client = docker.DockerClient(base_url=self.base_url, tls=self.tls, tls_client_key_passphrase=self.passphrase) + self.client = docker.DockerClient(base_url=self.base_url) + + def run(self, image="pegi3s/pydock3", command, volumes, remove=True): + ''' + run docker container and return output(rm container after run) +output = client.containers.run( + "pegi3s/pydock3", + "bash -c './run_ftdock {} {} {} {}'".format(project_name, number_of_solutions, best_PDB, restraint), + volumes={'/your/data/dir': {'bind': '/data', 'mode': 'rw'}}, + remove=True, +) +print(output) + ''' + output = self.client.containers.run( + image, + command, + volumes=volumes, + remove=remove + ) + return output diff --git a/vinautil/vinautil/vutils/errors/__init__.py b/vinautil/vinautil/vutils/errors/__init__.py new file mode 100644 index 0000000..bec26eb --- /dev/null +++ b/vinautil/vinautil/vutils/errors/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :__init__ +@Description: : +@Date :2023/4/9 17:35:17 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' diff --git a/vinautil/vinautil/vutils/errors/covalent_coord.py b/vinautil/vinautil/vutils/errors/covalent_coord.py new file mode 100644 index 0000000..a2bf4a4 --- /dev/null +++ b/vinautil/vinautil/vutils/errors/covalent_coord.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :covalent_coord +@Description: : +@Date :2023/4/9 17:35:40 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' + +class CustomError(Exception): + def __init__(self, message): + self.message = message + + def __str__(self): + return f"CustomError: {self.message}" diff --git a/vinautil/vinautil/vutils/hcovdock_crawler.py b/vinautil/vinautil/vutils/hcovdock_crawler.py new file mode 100644 index 0000000..2a9c2a1 --- /dev/null +++ b/vinautil/vinautil/vutils/hcovdock_crawler.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :hcovdock_crawler +@Description: : 爬虫分子对接HcovDock [文献](https://pubmed.ncbi.nlm.nih.gov/36573474/) [10.1093/bib/bbac559](https://doi.org/10.1093/bib/bbac559) +@Date :2023/2/23 10:21:53 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' + +import asyncio +from playwright.async_api import async_playwright +from playwright.sync_api import Playwright, sync_playwright, expect +from pathlib import Path +from time import sleep, strftime +import pickle + + + +def submit_one_task(ligand_file: Path, receptor_file: Path, residue: str, + receptor_mail:str='pylyzeng@gmail.com',url="http://huanglab.phys.hust.edu.cn/hcovdock", header_string = "Task"): + with sync_playwright() as playwright: + browser = playwright.chromium.launch(headless=False) + context = browser.new_context() + page = context.new_page() + page.goto(url, timeout=100000) + page.wait_for_load_state('networkidle') + + # 上传receptor + input_locator = page.locator('#receptor') + input_locator.set_input_files(receptor_file.as_posix()) + + # 等待上传完成 + page.wait_for_selector('#residue_option') + sleep(3) + + # 定位下拉框元素 + select_locator = page.select_option('#residue_option', residue) + print(select_locator) + + # 上传ligand + input_locator = page.locator('#ligand') + input_locator.set_input_files(ligand_file.as_posix()) + + # 等待上传完成 + page.wait_for_selector('#residue_option') + sleep(3) + + page.fill('#emailaddress', receptor_mail) + + date_str = header_string + strftime("%Y%m%d") + taskname = date_str + receptor_file.stem.split('.')[0] + page.locator("input[name=\"jobname\"]").click() + page.locator("input[name=\"jobname\"]").fill(taskname) + page.get_by_role("checkbox").check() + page.get_by_role("button", name="Submit").click() + print( + f''' Task submit: +taskname: {taskname} +receptor: {receptor_file.name} +ligand: {ligand_file.name} +mail: {receptor_mail} ''' + ) + + + + +if __name__ == '__main__': + # asyncio.run(main()) + root_path = Path('C:\\Users\\Admin\\OneDrive\\2020级研曾令宇\\实验数据\\hcovdock_benchmark') + l_path = root_path.joinpath('molecular_polar_pdb') + r_path = root_path.joinpath('receptor_pdb_scar') + lfiles = list(l_path.glob('*.pdb')) + rfiles = list(r_path.glob('*.scar.pdb')) + # 读取之前提交的任务信息 + record_file = Path('task_record.pkl') + if record_file.exists(): + with open(record_file, 'rb') as f: + submitted_tasks = pickle.load(f) + else: + submitted_tasks = [] + for r,l in zip(rfiles, lfiles): + t = r.stem.split('.')[0].split('-')[1].split('_')[1:3] + target_residue = f"{t[0]}:GLY{t[1]}" + task_info = (r.name, l.name, target_residue) + if task_info in submitted_tasks: + print(f"Task {task_info} has already been submitted, skipped.") + continue + submit_one_task(ligand_file=l, receptor_file=r, residue=target_residue, receptor_mail='pylyzeng@gmail.com', header_string='Newtask_gmail-') + + + # 将新提交的任务信息记录下来 + submitted_tasks.append(task_info) + with open(record_file, 'wb') as f: + pickle.dump(submitted_tasks, f) + + sleep(8) diff --git a/vinautil/vinautil/vutils/ledock_utils.py b/vinautil/vinautil/vutils/ledock_utils.py new file mode 100644 index 0000000..4091c22 --- /dev/null +++ b/vinautil/vinautil/vutils/ledock_utils.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :ledock_utils.py +@Description: : ledock准备工作,分割任务 +@Date :2023/2/22 18:47:25 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' +from pathlib import Path +from typing import List +import subprocess + +def generate_ledock_file(receptor:Path, l_list_outfile:Path, out_file:Path, l_list: Path, + x=[0, 0], y=[0, 0], z=[0, 0], n_poses=20,rmsd=1.0): + rmsd = str(rmsd) + x = [str(x) for x in x] + y = [str(y) for y in y] + z = [str(z) for z in z] + n_poses = str(n_poses) + l_list_outfile.write_text(l_list.as_posix()) # only for one molecular docking, otherwise change this part + + file = [ + 'Receptor\n', + receptor.as_posix() + '\n\n', + 'RMSD\n', + rmsd + '\n\n', + 'Binding pocket\n', + x[0], ' ', x[1], '\n', + y[0], ' ', y[1], '\n', + z[0], ' ', z[1], '\n\n', + 'Number of binding poses\n', + n_poses + '\n\n', + 'Ligands list\n', + l_list_outfile.as_posix() + '\n\n', + 'END'] + print(f'save ledock config: {out_file.as_posix()}') + out_file.write_text(''.join(file)) + +def box_vina_to_ledock(center_x, center_y, center_z, size_x, size_y, size_z): + # 计算 LeDockBox 中的坐标值 + minX = round(center_x - size_x / 2, 2) + maxX = round(center_x + size_x / 2, 2) + minY = round(center_y - size_y / 2, 2) + maxY = round(center_y + size_y / 2, 2) + minZ = round(center_z - size_z / 2, 2) + maxZ = round(center_z + size_z / 2, 2) + + # 构造 LeDockBox 字符串 + LeDockBox = "*********LeDock Binding Pocket*********\n" + \ + "Binding pocket\n%.1f %.1f\n%.1f %.1f\n%.1f %.1f\n" % (minX, maxX, minY, maxY, minZ, maxZ) + print(LeDockBox) + + return minX, maxX, minY, maxY, minZ, maxZ + +def config_vina2ledock(vina_file: Path, out_path: Path): + # test one instance + if not vina_file.exists(): raise FileNotFoundError("File not found: " + vina_file.as_posix()) + # extract box infos + content_list = [i for i in vina_file.read_text().splitlines() if i] # read vina config + center_x, center_y, center_z, size_x, size_y, size_z = [float(content.split(" = ")[1]) for content in + content_list[2:8]] + center_x = round(center_x, 2) + center_y = round(center_y, 2) + center_z = round(center_z, 2) + size_x = round(size_x, 2) + size_y = round(size_y, 2) + size_z = round(size_z, 2) + minX, maxX, minY, maxY, minZ, maxZ = box_vina_to_ledock(center_x, center_y, center_z, size_x, size_y, size_z) + # extract ligand infos + # 去掉_to_pdbqt 将_ploar.pdbqt变为_polar.pdb + lig_content = content_list[1].split('=')[1].strip().replace('molecular_polar_mol2_to_pdbqt', + 'molecular_polar_mol2', ).replace('_ploar.pdbqt', + '_polar.mol2') + lig_content = out_path.parent.joinpath(lig_content) + if not lig_content.exists(): raise FileNotFoundError("File not found: " + lig_content.as_posix()) + # extract protein infos + pro_content = out_path.parent.joinpath(content_list[0].split('=')[1].strip().replace('_pdbqt', '').replace('pdbqt', 'pdb')) + if not pro_content.exists(): raise FileNotFoundError("File not found: " + pro_content.as_posix()) + liglst = out_path.joinpath(vina_file.stem + '.list') + ledock_config_file = out_path.joinpath(vina_file.stem + '.in') + generate_ledock_file(receptor=pro_content, x=[minX, maxX], y=[minY, maxY], z=[minZ, maxZ], n_poses=20, + l_list=lig_content, l_list_outfile=liglst, + out_file=ledock_config_file) + +def ledock_split(ledock_results: Path, option='-spli'): + ledock_binary = '/home/lyzeng/ledock_benchmark/ledock_linux_x86' + res = subprocess.run([ledock_binary, option, ledock_results.as_posix()]) + if res.returncode != 1: + return res + +def split_main(): + ledock_results = list(Path('/home/lyzeng/ledock_benchmark/molecular_polar_mol2').glob('*.dok')) + file = ledock_results[0] + split_res = list(map(lambda x: x!=None ,[ledock_split(i) for i in ledock_results])) + +if __name__ == '__main__': + ... + + + + diff --git a/vinautil/vinautil/vutils/obabel.py b/vinautil/vinautil/vutils/obabel.py new file mode 100644 index 0000000..2171397 --- /dev/null +++ b/vinautil/vinautil/vutils/obabel.py @@ -0,0 +1,231 @@ +from dataclasses import dataclass, field +from pathlib import Path +from openbabel import pybel +from typing import Optional, NoReturn, List, Union, Dict, Tuple +import re +import tempfile, os +import numpy as np + + +@dataclass +class PDBQTparser: + file: Path + module_lst: List[List[str]] = field(init=False) + fmt: str = field(default_factory=lambda: 'pdbqt') + + def __post_init__(self): + self.module_lst = self.get_modules() + + def __repr__(self): + return f"{self.__class__.__name__}({self.file})" + + def __str__(self): + return f"offer some method for {self.file}" + + def get_modules(self, pose_num: int = None) -> Union[List[str], str]: + module_lst = [] + module = [] + txt = self.file.read_text() + module_lst = re.findall(r'MODEL\s+\d+.*?ENDMDL', txt, re.DOTALL) + if pose_num is None: + return module_lst + else: + return module_lst[pose_num] + + @property + def module_num(self) -> int: + return len(self.module_lst) + + @property + def scores(self) -> Dict[str,float]:# get affinity from pdbqt file + vina_match = r"VINA RESULT:\s+(-\d+\.\d+)" + model_match = r"MODEL\s+(\d+)" + return {str(re.search(model_match, i).group(1)): float(re.search(vina_match, i).group(1)) for i in self.module_lst} + + @property + def score_density(self) -> Dict[str,float]: + return {k: v/self.atomnum() for k,v in self.scores.items()} + + @property + def rerank_scores(self) -> List[Tuple[float, int]]: + l = self.scores.values() + s = sorted(list(set(l)), reverse=False) + return [(i, ii) for i in l for ii,jj in enumerate(s) if i == jj] + + @property + def energies(self): + return [float(i) for i in self.scores.values()] + + @staticmethod + def get_coord_dict(fmt: str, file: Path, removeHs: bool = True) -> dict: + molH = next(pybel.readfile(format=fmt, filename=file.as_posix())) # read first molecule + if removeHs: molH.OBMol.DeleteHydrogens() + return {atom.idx: atom.coords for atom in molH} + + def atomnum(self, removeHs: bool = True) -> int: + mols = pybel.readfile(format=self.fmt, filename=self.file.as_posix()) + if removeHs: + mols = [i.OBMol.DeleteHydrogens() and i for i in mols] + else: + mols = [i for i in mols] + atom_counts = [len(mol.atoms) for mol in mols] + if all(count == atom_counts[0] for count in atom_counts): + # print("All structures have the same number of atoms.") + return atom_counts[0] + else: + raise ValueError("Structures have different numbers of atoms.") + + @staticmethod + def pose_coords(pdbqt_string: str, remove_H = True) -> np.array: + mol_pose = pybel.readstring('pdbqt', pdbqt_string) + if remove_H: mol_pose.OBMol.DeleteHydrogens() + # 创建一个空的numpy数组,用来存储坐标信息 + coords = np.empty((len(mol_pose.atoms), 3)) + # 遍历mol_pose1的原子,把每个原子的坐标赋值给coords数组 + for i, atom in enumerate(mol_pose.atoms): + coords[i] = atom.coords + return coords + + @staticmethod + def rmsd(pose_ref:np.array, pose:np.array): + # 计算两个数组之间的差值,然后对每个元素平方 + diff = pose_ref - pose + diff_squared = diff ** 2 + assert len(pose_ref) == len(pose) + N = len(pose_ref) + # 对平方后的数组求和,然后除以原子个数N,最后开平方根 + sum_diff_squared = diff_squared.sum () + rmsd = np.sqrt (sum_diff_squared / N) + return rmsd + + + +@dataclass +class PDBQTtoMol2: + ''' + pdbqt 还原为mol2格式,还原健价信息, 删除所有氢元素方便后续计算RMSDß + ''' + original_mol2_file: str + undock_pdbqt: str + docked_pdbqt: List[str] # autodock vina outputs + pybel_mol: (List[pybel.Molecule], type(None)) = field(init=False) + + def __post_init__(self): + docked_pdbqt_pybel_mol = [pybel.readstring('pdbqt', i) for i in self.docked_pdbqt] + self.pybel_mol = self.get_pybel_mol(docked_pdbqt_pybel_mol) + + @staticmethod + def get_coord_dict(fmt: str, file_string: str, removeHs: bool = True) -> dict: + molH = pybel.readstring(fmt, file_string) # read first molecule + if removeHs: molH.OBMol.DeleteHydrogens() + return {atom.idx: atom.coords for atom in molH} + + def get_pybel_mol(self, docked_pdbqt_pybel_mol) -> List[pybel.Molecule]: + ''' + mind refence https://github.com/ag83/pdbqt-to-mol2/blob/be40bdda20ffb96cd3d173accf77e7a2da9a49aa/convert_to_mol2.py#L15 + convert autodock vina dock results restore to mol2 format, which include bond infomations + :return: pybel.Molecule or None + ''' + undocked_pdbqt = self.get_coord_dict('pdbqt', self.undock_pdbqt) + original_mol2 = self.get_coord_dict('mol2', self.original_mol2_file) + if len(docked_pdbqt_pybel_mol) == 1: + i_coords = {atom.idx: atom.coords for atom in docked_pdbqt_pybel_mol[0]} + mol = self.__update_coordinates(docked_pdbqt=i_coords, undocked_pdbqt=undocked_pdbqt, original_mol2=original_mol2) + mol.OBMol.DeleteHydrogens() + return [mol] + elif len(docked_pdbqt_pybel_mol) > 1: + lst = [] + for i in docked_pdbqt_pybel_mol: + i.OBMol.DeleteHydrogens() + i_coords = {atom.idx: atom.coords for atom in i} + lst.append(self.__update_coordinates(docked_pdbqt=i_coords, undocked_pdbqt=undocked_pdbqt, original_mol2=original_mol2)) + return lst + else: + return [] + + def __update_coordinates(self, undocked_pdbqt: Dict, docked_pdbqt: Dict, original_mol2: Dict) -> Optional[pybel.Molecule]: + assert len(undocked_pdbqt) == len(docked_pdbqt) == len( + original_mol2), f'Not equal number of atoms in molecules\n' \ + f'undocked_pdbqt: {len(undocked_pdbqt)}\n' \ + f'docked_pdbqt: {len(docked_pdbqt)}\n' \ + f'original_mol2: {len(original_mol2)}' + original_coord = {} + for key in original_mol2: + coord_update = [round(x, 3) for x in original_mol2[key]] + coord_update = tuple(coord_update) + original_coord.update({key: coord_update}) + coord_map = {} + for idx, coord in original_coord.items(): + # potential bottleneck for large molecules + for ind, coordinates in undocked_pdbqt.items(): + n = 0 + if coord[0] == coordinates[0]: + n = n + 1 + if coord[1] == coordinates[1]: + n = n + 1 + if coord[2] == coordinates[2]: + n = n + 1 + else: + if coord[2] == coordinates[2]: + n = n + 1 + else: + if coord[1] == coordinates[1]: + n = n + 1 + if coord[2] == coordinates[2]: + n = n + 1 + else: + if coord[2] == coordinates[2]: + n = n + 1 + if n == 3: + coord_map.update({idx: ind}) + elif n == 2: + if idx in coord_map: + pass + else: + coord_map.update({idx: ind}) + elif n == 1: + if idx in coord_map: + pass + else: + coord_map.update({idx: ind}) + else: + pass + if len(coord_map) == len(original_mol2): + coord_conform = {} + for index1, index2 in coord_map.items(): + coord_conform.update({index1: docked_pdbqt.get(index2)}) + mol2 = pybel.readstring('mol2', self.original_mol2_file) + mol2.OBMol.DeleteHydrogens() + for atom in mol2: + atom.OBAtom.SetVector(coord_conform.get(atom.idx)[0], coord_conform.get(atom.idx)[1], + coord_conform.get(atom.idx)[2]) + self.pybel_mol = mol2 + return mol2 + else: + raise ValueError('Lost coordinates in mapping') + + def to_file(self, out_file: Path, fmt: str = 'mol2') -> NoReturn: + if isinstance(self.pybel_mol, list): + for i in self.pybel_mol: + with pybel.Outputfile(fmt, out_file.as_posix(), overwrite=True) as f: + f.write(i) + elif isinstance(self.pybel_mol, pybel.Molecule): + self.pybel_mol.write(fmt, out_file.as_posix(), overwrite=True) + else: + raise print('No pybel.Molecule found') + + def to_string(self, fmt: str = 'mol2') -> Union[List[str], str]: + if isinstance(self.pybel_mol, list): + return [i.write(fmt) for i in self.pybel_mol] + elif isinstance(self.pybel_mol, pybel.Molecule): + return self.pybel_mol.write(fmt) + else: + raise print('No pybel.Molecule found') + +def mol2pdb(read_file: Path, out_file: Path, s_fmt='mol2', t_fmt='pdb' ): + # mol2 文件格式转化为pdb格式 + mols = list(pybel.readfile(format=s_fmt, filename=read_file.as_posix())) + if len(mols) == 1: + mol = mols[0] + mol.write(t_fmt, out_file.as_posix(), overwrite=True) + diff --git a/vinautil/vinautil/vutils/pdbapi.py b/vinautil/vinautil/vutils/pdbapi.py new file mode 100644 index 0000000..4f784ed --- /dev/null +++ b/vinautil/vinautil/vutils/pdbapi.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :pdbapi +@Description: : +@Date :2023/6/26 18:25:07 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' +from dataclasses import dataclass, field + + +@dataclass +class PDBAPI: + """https://data.rcsb.org/redoc/ + """ + # entry_id: str = field(default='1a4w', metadata={'help': 'PDB entry id'}) + # asym_id: str = field(default='A', metadata={'help': 'asym id'}) + comp_id: str = field(default='ZYA', metadata={'help': 'comp id'}) + + def querymole(): + # 定义API的URL + url = f"https://data.rcsb.org/rest/v1/core/chemcomp/{comp_id}" + # 发送GET请求 + response = requests.get(url) + # 将返回的JSON数据转换为Python字典 + data = response.json() + return data \ No newline at end of file diff --git a/vinautil/vinautil/vutils/pdbparse.py b/vinautil/vinautil/vutils/pdbparse.py new file mode 100644 index 0000000..191f2e4 --- /dev/null +++ b/vinautil/vinautil/vutils/pdbparse.py @@ -0,0 +1,275 @@ +from pathlib import Path +import gzip +import re +import time +from dataclasses import dataclass, field +from typing import List, Tuple, Dict, Union, Optional, NoReturn, Generator, Iterable, Callable, Any, Set +from enum import Enum +from myutils.pdbparse import Bdp +""" +prepare pdb file: +rsync -rlpt -v -z --delete --port=33444 rsync.rcsb.org::ftp_data/structures/divided/pdb/ /home/mmme/alist/local_storage/PDBdatabase/pdb/ +rsync -rlpt -v -z --delete --port=33444 rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ /home/mmme/alist/local_storage/PDBdatabase/cif/ +transfer pdb file to pdbqt file: +ADFRsuite-1.0/bin/relpath_prepare_receptor4.bat -r test.scar.pdb -o test.scar.pdbqt -A checkhydrogens +""" + +# here = Path(__file__).parent.resolve() +# import sys +# sys.path.insert(0, here) +from utils.typecheck import typeassert +from utils.pymol import api + +class CovalentSourceType(Enum): + PYMOL = 0 # use pymol show lines see covalent bond + PAPER = 1 # from sci paper get covalent information + LINK = 2 # from pdb file LINK record get covalent bond + def __str__(self): + return 'how to judge covalent bond is class CovalentType' + def is_pymol(self) -> bool: + return self == CovalentSourceType.PYMOL + + def is_paper(self) -> bool: + return self == CovalentSourceType.PAPER + + def is_link(self) -> bool: + return self == CovalentSourceType.LINK + +class ConvalentType: + SMALL_MOLECULE_SIDE = 'molecule_side' + PROTEIN_SIDE = 'protein_side' + def __init__(self, type: str): + if type not in (self.SMALL_MOLECULE_SIDE, self.PROTEIN_SIDE): + raise ValueError(f"Invalid ConvalentType: {type}") + self.type = type + + def __repr__(self): + return f"ConvalentType({self.type!r})" + + def __doc__(self): + return f"ConvalentType representing a covalent bond on either the molecule side or the protein side." + + def __str__(self): + return self.type + + @classmethod + def is_small_molecule_side(cls, val: Union[str, 'ConvalentType']): + return val == ConvalentType.SMALL_MOLECULE_SIDE or val == ConvalentType.SMALL_MOLECULE_SIDE + + @classmethod + def is_protein_side(cls, val: Union[str, 'ConvalentType']): + return val == ConvalentType.PROTEIN_SIDE or val == ConvalentType.PROTEIN_SIDE + +@dataclass +class CovalentMsg(): + sourcetype: CovalentSourceType + content: Optional[List[Dict]] + """type: Union[ConvalentType, type(None)] = field(default=None) +""" + def is_pymol(self) -> bool: + return self.sourcetype == CovalentSourceType.PYMOL + + def is_paper(self) -> bool: + return self.sourcetype == CovalentSourceType.PAPER + + def is_link(self) -> bool: + return self.sourcetype == CovalentSourceType.LINK + +@dataclass +class molecule(): + """ + A class representing a molecule. + + Attributes: + resn (str): The residue name of the molecule. + auth_label_chain (str): The author-assigned label and chain identifier of the molecule. + seq (int): The sequence number of the molecule. + cov (Union[CovalentMsg, None]): The covalent bond information of the molecule (if any). Default value is None. + """ + resn: str + auth_label_chain: str + seq: int + cov: Union[CovalentMsg, None] = field(default=None) + + +@typeassert(file=Path) +@dataclass +class pdb_parse(api): + """ + Class for parsing PDB files. + file: Path + The file path of the PDB file to be parsed. + pid: str + The PDB ID of the molecule. This field is initialized to be empty and will be filled when the PDB file is parsed. + file_content: List[str] + A list of strings representing the lines of the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + hetatm: Tuple[str] + A tuple of strings representing HETATM records in the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + organic: Tuple[str] + A tuple of strings representing organic HETATM records in the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + metals: Tuple[str] + A tuple of strings representing metal HETATM records in the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + inorganic: Tuple[str] + A tuple of strings representing inorganic HETATM records in the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + chain: Tuple[str] + A tuple of strings representing chain records in the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + mole: List[molecule] + A list of `molecule` objects representing the molecules in the PDB file. This field is initialized to be empty and will be filled when the PDB file is parsed. + """ + file: Path + pid: str = field(init=False) + file_content: List[str] = field(init=False) + hetatm: Tuple[str] = field(init=False) + organic: Tuple[str] = field(init=False) + metals: Tuple[str] = field(init=False) + inorganic: Tuple[str] = field(init=False) + chain: Tuple[str] = field(init=False) + mole: List[molecule] = field(init=False) + pymol_start_args: List[str] = field(default_factory=lambda: ["pymol", "-M", "-A1"]) + + def __repr__(self): + return f"{self.__class__.__name__}('{self.file.as_posix()}')" + def __post_init__(self): + self.pid = self.match_pid(string=self.file.name) + self.file_content = list(self.get_file_content()) + self.pymol_start(args=self.pymol_start_args) + self.load_file() # load file in pymol + self.__init_infos() # init base infos + + def __init_infos(self) -> NoReturn: + self.chain = list(set(self.pymol_print_exec(atom_attribute='chain'))) + self.organic = self.pymol_get_resn(object_name='organic') + self.hetatm = self.pymol_get_resn(object_name='hetatm') + self.metals = self.pymol_get_resn(object_name='metals') + self.inorganic = self.pymol_get_resn(object_name='inorganic') + self.mole = self._get_molecule() + + def _get_molecule(self) -> List[molecule]: + lst = [] + for c in self.chain: + for m in self.organic: + # print(f'in {self.file.name}, try get {c} chain {m} molecule') + seq = self.pymol_get_sequence(resn=m, chain=c) + if len(seq) == 1 : # one molecule is in this chain + lst.append(molecule(resn=m, auth_label_chain=c, seq=seq, cov=self._search_covalent_bond())) + elif len(seq) > 1: # same chemical molecule have more than one molecule in this chain + for s in seq: + lst.append(molecule(resn=m, auth_label_chain=c, seq=s, cov=self._search_covalent_bond())) + else: + pass # the molecule not in this chain + return lst + def _search_covalent_bond(self, distance:float = 3.00) -> List[Dict[str, str]]: + """ + search covalent bond from pymol, around all molecule 3.00 ANG + """ + # CovalentSourceType.PYMOL + for c in self.chain: + for mol in self.organic: + pdb_str = self.pymol_get_around(resn=mol, chain=c, distance=distance) + # print(pdb_str) + search_cov = self.pdbstr_covalent(pdb_str=pdb_str) + # print(search_cov) + return search_cov + + @staticmethod + def pdbstr_covalent(pdb_str: str) -> List[Dict[str, str]]: + """ + get covalent bond from pdb string, bond connect ATOM line and HETATM line + :param pdb_str: + :return: list of dict + """ + pdbstr_lines = pdb_str.splitlines() + cov_bonds = [] + for line in pdbstr_lines: + if line.startswith('CONECT'): + conect_info = line + if len(line.split()) == 3: + atoms = list(map(lambda x: int(x), line.split()[1:])) + atom_lines = list( + filter(lambda x: x.startswith('ATOM') or x.startswith('HETATM'), pdbstr_lines)) + atoms_info = list(map(lambda x: x.strip(), atom_lines)) + lst = [conect_info, atoms_info[atoms[0] - 1], atoms_info[atoms[1] - 1]] + cov_bond = {i.split()[0]:i for i in lst} + cov_bonds.append(cov_bond) + return_value = [i for i in cov_bonds if len(i.keys()) == 3] + return return_value + + def _get_file_content_iterable(self) -> Iterable[str]: + with open(self.file.as_posix(), 'r') as f: + for line in f: + yield line + + def get_file_content(self) -> Iterable[str]: + if '.ent' and '.gz' in self.file.suffixes: + self._ungzip_pdb_gz() + return self._get_file_content_iterable() + elif '.pdb' in self.file.suffix: + return self._get_file_content_iterable() + elif '.ent' in self.file.suffix: + return self._get_file_content_iterable() + else: + raise ValueError(f"File type error: {self.file}, only support .ent, .pdb or .ent.gz") + + def _ungzip_pdb_gz(self) -> NoReturn: + fmt = 'pdb' + with gzip.open(self.file.as_posix(), 'rt') as f_in: + if self.pid is None: + ungzip_file_name = f"unkown_PDB_ID_{int(time.time())}.{fmt}" # 如果达到秒级运算,则需要修改删除int,否则会覆盖文件 + else: + ungzip_file_name = f"{self.pid}.{fmt}" + ungzip_file = self.file.parent.joinpath(ungzip_file_name) + with open(ungzip_file.as_posix(), 'w', encoding='utf-8') as f_out: + for line in f_in: + f_out.write(line) + self.file = ungzip_file + + def _get_link_info(self) -> List[str]: + return self._get_start_with("LINK") + + def _get_start_with(self, start_word: str) -> List[str]: + return [line for line in self.file_content if line.startswith(start_word)] + + @staticmethod + def match_pid(string) -> Optional[str]: + # pattern = r'(pdb)?(?P[A-Za-z\d]{4}).(ent|pdb)(.gz)?' + # ['r3w2.pdb', 'r3w2.pdb.gz', 'pdbr3w2.ent', 'pdbr3w2.ent.gz', 'pdbr3w2'] # pdbid = r3w2 test + pattern = r'(pdb)?(?P[A-Za-z\d]{4})(\.(ent|pdb)(\.gz)?)?' + match = re.match(pattern, string) + if match: + return match.group('pdbid') + return None + + def link_info(self): + pass + + +def get_gz_content(file: Path, ungzip:bool = True) -> str: + if '.gz' in file.suffixes: + with gzip.open(file.as_posix(), 'rt') as f: + content = f.read() + else: + raise ValueError(f"{file.as_posix()} is not a gzip file, or not suffix with '.gz'") + if ungzip: + rename = file.name.split('.')[0] + ungzip_file = file.parent.joinpath(rename).as_posix() + with open(ungzip_file, 'w', encoding='utf-8') as f: + f.write(content) + return content + + + + +""" +共价键记录不一定只在LINK开头的信息中,还有可能在非LINK开头的信息中,例如:REMARK 500 +PDB ID: 3sh8 +REMARK 500 THE FOLLOWING ATOMS ARE IN CLOSE CONTACT. +REMARK 500 +REMARK 500 ATM1 RES C SSEQI ATM2 RES C SSEQI DISTANCE +REMARK 500 OG SER A 70 C8 CED A 1 1.90 +REMARK 500 OG SER B 70 C8 CED B 1 2.03 +REMARK 500 +REMARK 500 REMARK: NULL +REMARK 500 +REMARK 500 GEOMETRY AND STEREOCHEMISTRY +REMARK 500 SUBTOPIC: COVALENT BOND LENGTHS +""" \ No newline at end of file diff --git a/vinautil/vinautil/vutils/pdbparse_module.py b/vinautil/vinautil/vutils/pdbparse_module.py new file mode 100644 index 0000000..6a5d26e --- /dev/null +++ b/vinautil/vinautil/vutils/pdbparse_module.py @@ -0,0 +1,805 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :pdbparse.py +@Description: : hanlde PDB file +@Date :2022/10/13 15:28:49 update +@Author :hotwa +@version :1.0 +''' +# need biopython +import os, time +import pandas as pd +from Bio.PDB.PDBParser import PDBParser +from Bio.PDB import PDBIO, Select, parse_pdb_header, PDBList +from pathlib import Path +from pdbfixer import PDBFixer +from openmm.app import PDBFile + +try: + from .typecheck import typeassert +except (ImportError, ModuleNotFoundError, AttributeError, NameError, SyntaxError): + from typecheck import typeassert + +file_dir = os.path.dirname(os.path.realpath(__file__)) # 当前python文件的绝对路径 +path_obj = Path(file_dir) + + +# timestring = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + +# local import data +def get_bonding(file='amino_bonding_atom.csv'): + file = path_obj.joinpath(file) + return pd.read_csv(file) + + +def get_complex(file='./new_complex_finish_update_pdb_site.xlsx', same_chain=False): + """ + :param file: xlsx file + :param same_chain: Chain_identifier_mole == Residue_chain + :return: + """ + file = path_obj.joinpath(file) + df = pd.read_excel(file) + if same_chain: + return df[df['Chain_identifier_mole'] == df['Residue_chain']] + else: + return df + + +class PDBLengthError(BaseException): + def __init__(self, arg): + self.arg = arg + + +class ChainSelect(Select): + def __init__(self, chain_string='A'): + super(Select, self).__init__() + self.chain_string = chain_string + + def accept_chain(self, chain): + if str(chain.__repr__()) == ''.format(self.chain_string): # judge chain name + return 1 + else: + return 0 + + +class ResidueSelect(Select): + """ResidueSelect ues by select + + [extended_summary] + + :param Select: [description] + :type Select: [type] + """ + + def __init__(self, residue): + super(Select, self).__init__() + self.residue = residue + + def accept_residue(self, residue): + if residue.get_resname() == self.residue: + return 1 + else: + return 0 + + +# ! error class for Bdp +class DoiError(BaseException): + def __init__(self, arg): + self.arg = arg + + +class ChainError(BaseException): + def __init__(self, arg): + self.arg = arg + + +@typeassert(path=Path, sid=str) +class Bdp(object): + __slots__ = ['path', 'sid', 'mole_id', 'mole_struct', 'create_path', '__dict__'] + + def __init__(self, path: Path, sid: str): + self.path = path + self.create_path = path.parent + self.sid = sid + self.mole_id = [] + self.mole_struct = [] + # 初始化数据 + self.read_formula() + + @property + def chainnum(self): + s = self.read_struct() + first_model = s[0] + return len(first_model) + + @property + def chainstr(self): + return self.getchain_str() + + @property + def headerinfos(self): + return parse_pdb_header(self.path) + + @property + def doi(self): + _journal = self.headerinfos['journal'] + _doi = _journal.split()[-1] + if _doi.startswith('10.'): + return _doi + else: + raise DoiError(f'current pdb does not doi, {_journal}') + + def cleanATOM(self, out_file=None, ext="_clean.pdb") -> Path: # from pyrosetta.toolbox import cleanATOM + """Extract all ATOM and TER records in a PDB file and write them to a new file. + + Args: + pdb_file (str): Path of the PDB file from which ATOM and TER records + will be extracted + out_file (str): Optional argument to specify a particular output filename. + Defaults to .clean.pdb. + ext (str): File extension to use for output file. Defaults to ".clean.pdb" + """ + pdb_file = self.path.as_posix() + # find all ATOM and TER lines + with open(pdb_file, "r") as fid: + good = [l for l in fid if l.startswith(("ATOM", "TER"))] + + # default output file to _clean.pdb + if out_file is None: + out_file = os.path.splitext(pdb_file)[0] + ext + + # write the selected records to a new file + with open(out_file, "w") as fid: + fid.writelines(good) + return Path(out_file) + + def fix_pdb(self): + path = self.path.parent.as_posix() + pdb_id = self.path.as_posix() + print("Creating PDBFixer...") + fixer = PDBFixer(pdb_id) + print("Finding missing residues...") + fixer.findMissingResidues() + + chains = list(fixer.topology.chains()) + keys = fixer.missingResidues.keys() + for key in list(keys): + chain = chains[key[0]] + if key[1] == 0 or key[1] == len(list(chain.residues())): + print("ok") + del fixer.missingResidues[key] + + print("Finding nonstandard residues...") + fixer.findNonstandardResidues() + print("Replacing nonstandard residues...") + fixer.replaceNonstandardResidues() + print("Removing heterogens...") + fixer.removeHeterogens(keepWater=True) + + print("Finding missing atoms...") + fixer.findMissingAtoms() + print("Adding missing atoms...") + fixer.addMissingAtoms() + print("Adding missing hydrogens...") + fixer.addMissingHydrogens(7) + print("Writing PDB file...") + + PDBFile.writeFile( + fixer.topology, + fixer.positions, + open(os.path.join(path, "%s_fixed_pH_%s.pdb" % (pdb_id.split('.')[0], 7)), "w"), keepIds=True) + return "%s_fixed_pH_%s.pdb" % (pdb_id.split('.')[0], 7) + + def residuesequence(self, label): # 'HETATM' 'ATOM' + df = self.transformtodataframe(path=self.path, first_label=label) + return set(df['ResidueSequence'].to_list()) + + def value_check(self): + try: + if len(self.mole_id) == len(self.mole_struct): + print('check pass!') + else: + raise ValueError('molecule identifier do not match formula identifier! manual check') + except: + raise ValueError + + @staticmethod + def read_line_list(first_column, path): + """read_line_list 读取第一列为first_column字符串的行,存为列表返回 + + [extended_summary] + + Arguments: + first_column {[string]} -- [pdb文件内容第一列常常为大写] + path {[string]} -- [路径] + + Returns: + [iter] -- [含有first_column字符串的生成器] + """ + stringiter = Bdp.stringlinesiter(file=path) + stringiter = map(lambda x: x.strip(), stringiter) + stringiter = filter(lambda x: bool(x), stringiter) # clean empty + return filter(lambda x: x.split()[0] == first_column, stringiter) + + @staticmethod + def stringlinesiter(file): + with open(file, 'r+', encoding='utf-8') as f: + yield from f.readlines() + + def mole_site(self, lig: str, chain: str) -> int: + readmolelist = self.split_molecule(residue=lig, chain=chain) + with open(readmolelist, 'r'): + df = Bdp.transformtodataframe(first_label='HETATM', path=readmolelist) + return df['ResidueSequence'].tolist()[0] + + def _split_molecule_from_chainlist(self, chainlist: list, s: object, residue: str) -> list: + # 对每条链上都与这个小分子结合的链上的结合信息都提取出来 + save_list, remove_list = [], [] + for i in chainlist: + io = PDBIO() + io.set_structure(s[0][i]) + sidc = self.sid.replace('.pdb', '') if '.pdb' in self.sid else self.sid + savename = f'{sidc}_{i}_{residue}.pdb' + create_dir = self.create_path.joinpath('split_molecule') + if not create_dir.exists(): create_dir.mkdir() + path = create_dir.joinpath(savename) + save_list.append(path.__str__()) + io.save(path.__str__(), ResidueSelect(residue)) + if path.exists(): + with open(path, 'r+') as f: + content = f.readline() + if content.strip() == 'END': + # print(f'{savename} this chain have not molecule remove it') + remove_list.append(path) # 该链没有小分子,删除文件 + for i in remove_list: + os.remove(i) # remove the empty molecule file + header_info = f'''--- +title: Split molecule from PDB +date: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} +type: pdb-{self.sid} +--- +[TOC] +# Description +split {','.join(self.mole_id)} +''' + create_path_md = self.create_path.joinpath('split_molecule', 'README.md') + with open(create_path_md, 'w+') as f: + f.write(header_info) + return save_list + + def split_molecule(self, residue: str, chain: str = None, + keep_one_struct: bool = True) -> list: # ! 对于修饰残基的提取不完美 bug + ''' + split_molecule 从pdb文件中提取小分子 + :param residue: 小分子的名字,ligandID + :param chain: 小分子所在的链,如果为None则提取所有链 + :return: 返回小分子的路径 list + # ! 需要对提取的小分子构象二次检查,小分子构象可能有两个 residue_keep_struct + ''' + residue = str(residue) + s = self.read_struct() + chainlist = self.getchain_str() # 该蛋白获取所有链id编号 + if chain == None: + # 提取所有链上的小分子 + save_list = self._split_molecule_from_chainlist(chainlist=chainlist, s=s, residue=residue) + save_list = list(map(Bdp.residue_keep_struct_file, save_list)) + else: + if chain in chainlist: + chainlist = [chain] + else: + raise ValueError(f'chain ID :{chain} not in {self.sid}, file: {self.path}') + save_list = self._split_molecule_from_chainlist(chainlist, s, residue) + save_list = list(map(Bdp.residue_keep_struct_file, save_list)) + return save_list + + @staticmethod + def residue_keep_struct_file(file): # design for split_molecule method + # smart to recognize the chain + df_mol = Bdp.transformtodataframe(first_label='HETATM', path=file) + try: + construct_id = Bdp.get_residue_AtomLocationIndicator(df_mol) + id = construct_id[0] + except IndexError as e: + raise e(f'muti construct get failed, please check {file}') + df_item = Bdp.residue_keep_struct(file, id=id, first_label='HETATM') + if df_item.empty: + raise ValueError(f'error in {file} to get residue_keep_struct') + else: + out_file = Bdp.dataframe2pdb(df_item, file) + if Path(out_file).exists() and out_file == file: + return out_file # transform success + + @staticmethod + def clean_struct(file, label): # 清除所有多重构象的结构构象 AtomLocationIndicator 中保留第一个 + df0 = Bdp.transformtodataframe(path=file, first_label=label, keep_space=False) + seq_lst = list(set(df0['ResidueSequence'].to_list())) + df1 = Bdp.transformtodataframe(first_label=label, path=file, keep_space=True) + df = Bdp.transformtodataframe(first_label=label, path=file, keep_space=False) + # 搜集含有多重构象的残基序列,并且保留第一个构象 + remove_index_lst = [] # 需要删除的行(索引) + for site in seq_lst: + df_muta = df[df['ResidueSequence'] == str(site)] # 定位残基 + construct_id = Bdp.get_residue_AtomLocationIndicator(df_muta) + if construct_id: # true 有多重构象 + id = construct_id[0] + # 删除除了id构象的其他构象信息 + all_index = df_muta.index + keep_index = df_muta[df_muta['AtomLocationIndicator'] == id].index # 需要保留的侧链信息 + public_index = df_muta[df_muta['AtomLocationIndicator'] == ''].index # 主链信息 + remove_index = list(set(all_index) - set(keep_index) - set(public_index)) # 需要删除的索引信息 + remove_index_lst.extend(remove_index) + remove_index_data = pd.Index(data=remove_index_lst + , dtype='int64') + df_out = df1.drop(index=remove_index_data, axis=0) + df_out.index = [i for i in range(len(df_out))] + return df_out # 使用 Bdp.dataframe2pdb 转换为pdb文件 + + @staticmethod + def keep_residue(file, site, first_label: str = 'ATOM', id='auto'): + """ + keep_residue 智能保留残基 + :param file: read file, pdb file + :param site: residue site + :param first_label: 'ATOM' or 'HETATM' + :param id: keep construct id(like A, B……) default: first id(A) + :return: pd.DataFrame + """ + df1 = Bdp.transformtodataframe(first_label, path=file, keep_space=True) + df = Bdp.transformtodataframe(first_label, path=file, keep_space=False) + # 清除原始蛋白中的site位点信息 + df_remove = df[df['ResidueSequence'] != str(site)] + # 定位残基 + df_muta = df[df['ResidueSequence'] == str(site)] + df_muta_num = len(df_muta) + construct_id = Bdp.get_residue_AtomLocationIndicator(df_muta) + if construct_id: + if id == 'auto': + id = construct_id[0] + else: + if id not in construct_id: + raise ValueError(f'{file} 在 {site} 没有 {id} 构象信息, 可选构象为 {construct_id}') + id = id.upper() + else: + raise ValueError(f'{file} 在 {site} 没有构象信息') + # 删除除了id构象的其他构象信息 + all_index = df_muta.index + keep_index = df_muta[df_muta['AtomLocationIndicator'] == id].index # 需要保留的侧链信息 + public_index = df_muta[df_muta['AtomLocationIndicator'] == ''].index # 主链信息 + remove_index = list(set(all_index) - set(keep_index) - set(public_index)) + remove_index = pd.Index(data=remove_index, dtype='int64') + df_out = df1.drop(index=remove_index, axis=0) + df_out.index = [i for i in range(len(df_out))] + return df_out + + @staticmethod + def get_residue_AtomLocationIndicator(df): + # 返回残基的DataFrame构象的标识符号,没有则返回空列表 + lst = [i for i in list(set(df['AtomLocationIndicator'].tolist())) if i] + lst.sort() + return lst + + @staticmethod + def residue_keep_struct(file, id: str = 'A', first_label: str = 'ATOM'): + # 需要重新修改该函数,针对摸个特定的残基进行保留相应的构象,而不是同一保留所有的构象,例如A构象 + # 有些残基在A构象中不存在,但是在B构象中存在,会导致报错,残基会出现缺失的情况。 + ''' + residue_keep_construct 保留构象,删除其他构象 + :param file: read file, pdb file + :param id: keep construct id(like A,B……) + :param first_label: ‘’ATOM‘’ or ‘’HETATM‘’ + :return: pd.DataFrame + ''' + df = Bdp.transformtodataframe(first_label, path=file, keep_space=True) + remove_index = [] + for i in df.iterrows(): + atomsign = i[1]['AtomLocationIndicator'] + if bool(atomsign.strip()): + if atomsign == id.upper(): + df.loc[i[0], 'AtomLocationIndicator'] = ' ' + else: + print(atomsign) + remove_index.append(i[0]) + df = df.drop(index=df.index[remove_index], axis=0) + df.index = [i for i in range(len(df))] + return df + + def getchain_str(self): + """getchain_str retrun chain symbol name + + [extended_summary] + + :return: [description] + :rtype: list + """ + _l = [] + p = PDBParser(PERMISSIVE=1) + s = p.get_structure(self.sid, self.path) + chain_gennerate = s[0].copy().get_chains() + for i in chain_gennerate: + _l.append(i.id) + _l.sort() + return _l + + def site_exists(self, residue, num): + """site_exists 判断改蛋白的某个位点是否存在,并返回该位点所在的链 + + 用于判断结合位点的链在哪里 + + :param residue: 残基名称 + :type residue: string + :param num: 残基编号 + :type num: int + :return: chain + :rtype: list + """ + _l = [] + num = int(num) + residue = residue.strip().upper() + chainlist = self.getchain_str() + s = self.read_struct() + for i in chainlist: + try: + resname_from_pdb = s[0][i][num].get_resname() + if residue == resname_from_pdb: + _l.append(i) + except KeyError as e: + print(e, f'chain {i} not exist {residue}-{num}') + return _l + + def link_info(self, qurey_key=None): + dfl = self.transformtodataframe(first_label='LINK', path=self.path) + if qurey_key == None: + return dfl + else: + return dfl[(dfl['Residue_name1'] == qurey_key) | (dfl['Residue_name2'] == qurey_key)] + + def link_mole_site(self, mole_name: str): + mole_name = mole_name.upper() + df = self.link_info(qurey_key=mole_name) + if df.empty: + return None + elif len(df) == 1: + item = df.values.tolist()[0] + if item[3] == mole_name: + return item[5] + elif item[9] == mole_name: + return item[11] + else: + raise ValueError('impossible') + else: + sites_lst = [] + data = df.values.tolist() + for item in data: + if item[3] == mole_name: + sites_lst.append(item[5]) + elif item[9] == mole_name: + sites_lst.append(item[11]) + else: + ... + assert len(sites_lst) == len(data) + return sites_lst + + @classmethod + def split_chains(cls, file: Path, sid, out_filename=None, extract='auto', clean=True): + directory = Path(file).parent + if not directory.exists(): directory.mkdir(parents=True) + self = cls(file, sid) + file = cls.clean_pdb(file) if clean else file # keep ATOM line + auth_chainlist = self.getchain_str() + p = PDBParser(PERMISSIVE=1) + s = p.get_structure(self.sid, file) + file = Path(file) + if extract == 'auto': + for j in chainlist: + filename = f'{file.stem}_{j}.pdb' + self._split_chain_save_struct(obj=s[0][j], sid=self.sid, file=filename) + elif isinstance(extract, list): + for j in extract: + if j in auth_chainlist: + filename = f'{file.stem}_{j}.pdb' + self._split_chain_save_struct(obj=s[0][j], sid=self.sid, file=filename) + else: + print(f'chain {j} not exist') + elif isinstance(extract, str): + if extract in auth_chainlist: + filename = out_filename if out_filename else f'{file.stem}_{extract}.pdb' + self._split_chain_save_struct(obj=s[0][extract], sid=self.sid, file=filename) + else: + print(f'chain {extract} not exist') + else: + ... + return self + + def _split_chain_save_struct(self, obj, sid, file): + io = PDBIO() + io.set_structure(obj) + sidc = sid.replace('.pdb', '') if '.pdb' in sid else sid + if not Path(file).parent.exists(): Path(file).parent.mkdir(parents=True) + io.save(file.__str__(), ChainSelect(obj.id)) + return file + + @staticmethod + def clean_pdb(file, outfile=None): + """clean_pdb 清除杂原子,保留ATOM + + _extended_summary_ + + Keyword Arguments: + outfile {string} -- _description_ (default: {None}) + path {path} -- _description_ (default: {None}) + + Returns: + object -- path + """ + __lst = Bdp.read_line_list('ATOM', file) + _f = Path(outfile) if outfile else Path(file).parent.joinpath(f"{Path(file).stem}.clean.pdb") + if _f.exists(): _f.unlink() + with open(_f.__str__(), 'w+') as f: + for i in __lst: + f.write(f"{i}\n") + return _f + + def read_struct(self): + """read_struct read pdb structure + + https://biopython-cn.readthedocs.io/zh_CN/latest/cn/chr11.html#id3 + 一个 Structure 对象的整体布局遵循称为SMCRA(Structure/Model/Chain/Residue/Atom,结构/模型/链/残基/原子)的体系架构: + + 结构由模型组成 + 模型由多条链组成 + 链由残基组成 + 多个原子构成残基 + Returns: + [obj] -- [Model] + """ + p = PDBParser(PERMISSIVE=1) + s = p.get_structure(self.sid, self.path) + return s + + def get_chain(self, chain_str): + s = self.read_struct() + return s[0][chain_str] + + def read_formula(self): + """read_formula read pdb file molecule information and identifier in this pdb file(just like residue) + + [extended_summary] + + :raises OSError: Do not find this file + """ + _path = self.path + try: + with open(_path, 'r') as f: + text_line = f.readlines() + _l = [] + for i in text_line: + li = i.split() + if li: + if li[0] == 'FORMUL': + _l.append(li) + self.mole_id.append(li[2]) + for ii in _l: + self.mole_struct.append(''.join(ii[3:])) + except Exception as e: + raise e + + def mole_link(): + Bdp.transformtodataframe(first_label='LINK', ) + + @staticmethod + def transformtodataframe(first_label='ATOM', readlinecontent=None, path=False, keep_space: bool = True): + # https://www.wwpdb.org/documentation/file-format-content/format33/v3.3.html + path_info = (Path(path).name.__str__(), Path(path).parent.__str__()) if path else ( + 'unknown file name', 'unknown path') + if path: readlinecontent = Bdp.read_line_list(first_column=first_label, path=path) + if readlinecontent == None: raise ValueError('readlinecontent need') + if first_label == 'ATOM' or first_label == 'HETATM': # ! 注意python中的懒惰运算,及运算符特性 + # 将pdb每行读取的信息转化为dataframe 针对ATOM HETAM开头的行适用 + Identifier, AtomNum, AtomName, AtomLocationIndicator, ResidueName, ChainIndentifier, ResidueSequence, InsertionsResidue, X, Y, Z, Occupancy, Tfactor, SegmentIdentifier, ElementSymbol = [], [], [], [], [], [], [], [], [], [], [], [], [], [], [] + for i in readlinecontent: + if keep_space: # for restore to pdb file + Identifier.append(i[:7]) + AtomNum.append(i[6:12]) # 这里应该是切片6:12,但是PDB官网是6:11,导致AtomNum的值可能是错的,这里预留了一位数字,在超过10个原子时会出现两位数的情况 + AtomName.append(i[12:16]) + AtomLocationIndicator.append(i[16]) + ResidueName.append(i[17:20]) + ChainIndentifier.append(i[21]) + ResidueSequence.append(i[22:26]) + InsertionsResidue.append(i[26]) + X.append(i[30:38]) + Y.append(i[38:46]) + Z.append(i[46:54]) + Occupancy.append(i[54:60]) + Tfactor.append(i[60:66]) + SegmentIdentifier.append(i[72:76]) + ElementSymbol.append(i[76:78]) + else: + Identifier.append(i[:7].strip()) + AtomNum.append( + i[6:12].strip()) # 这里应该是切片6:12,但是PDB官网是6:11,导致AtomNum的值可能是错的,这里预留了一位数字,在超过10个原子时会出现两位数的情况 + AtomName.append(i[12:16].strip()) + AtomLocationIndicator.append(i[16].strip()) + ResidueName.append(i[17:20].strip()) + ChainIndentifier.append(i[21].strip()) + ResidueSequence.append(i[22:26].strip()) + InsertionsResidue.append(i[26].strip()) + X.append(float(i[30:38].strip())) + Y.append(float(i[38:46].strip())) + Z.append(float(i[46:54].strip())) + Occupancy.append(i[54:60].strip()) + Tfactor.append(i[60:66].strip()) + SegmentIdentifier.append(i[72:76].strip()) + ElementSymbol.append(i[76:78].strip()) + df = pd.DataFrame(data={ + 'Identifier': Identifier, + 'AtomNum': AtomNum, + 'AtomName': AtomName, + 'AtomLocationIndicator': AtomLocationIndicator, + 'ResidueName': ResidueName, + 'ChainIndentifier': ChainIndentifier, + 'ResidueSequence': ResidueSequence, + 'InsertionsResidue': InsertionsResidue, + 'X': X, + 'Y': Y, + 'Z': Z, + 'Occupancy': Occupancy, + 'Tfactor': Tfactor, + 'SegmentIdentifier': SegmentIdentifier, + 'ElementSymbol': ElementSymbol + }) + elif first_label == 'LINK': + # https://www.wwpdb.org/documentation/file-format-content/format33/sect6.html + Record_name, Atom_name1, Alternate_location_indicator1, Residue_name1, Chain_identifier1, Residue_sequence_number1, Insertion_code1, Atom_name2, Alternate_location_indicator2, Residue_name2, Chain_identifier2, Residue_sequence_number2, Insertion_code2, Symmetry_operator_atom_1, Symmetry_operator_atom_2, Link_distance = [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [] + for i in readlinecontent: + if len(i.strip()) != 78: + # logger.exception( + # f"[LINK label length error] not match the LINK line length\n" + # f"[input string]:{i}\n" + # f"check your input file:{path_info[0]} from {path_info[1]}\n" + # "if this line not have link distance, try to caculate by covalent.bypymol method") + pass + else: + Record_name.append(i[:6].strip()) + Atom_name1.append(i[12:16].strip()) + Alternate_location_indicator1.append(i[16].strip()) + Residue_name1.append(i[17:20].strip()) + Chain_identifier1.append(i[21].strip()) + Residue_sequence_number1.append(i[22:26].strip()) + Insertion_code1.append(i[26].strip()) + Atom_name2.append(i[42:46].strip()) + Alternate_location_indicator2.append(i[46].strip()) + Residue_name2.append(i[47:50].strip()) + Chain_identifier2.append(i[51].strip()) + Residue_sequence_number2.append(i[52:56].strip()) + Insertion_code2.append(i[56].strip()) + Symmetry_operator_atom_1.append(i[59:65].strip()) + Symmetry_operator_atom_2.append(i[66:72].strip()) + Link_distance.append(i[73:78].strip()) + df = pd.DataFrame(data={ + 'Record_name': Record_name, + 'Atom_name1': Atom_name1, + 'Alternate_location_indicator1': Alternate_location_indicator1, + 'Residue_name1': pd.Series(Residue_name1, dtype='object'), + 'Chain_identifier1': Chain_identifier1, + 'Residue_sequence_number1': Residue_sequence_number1, + 'Insertion_code1': Insertion_code1, + 'Atom_name2': Atom_name2, + 'Alternate_location_indicator2': Alternate_location_indicator2, + 'Residue_name2': pd.Series(Residue_name2, dtype='object'), + 'Chain_identifier2': Chain_identifier2, + 'Residue_sequence_number2': Residue_sequence_number2, + 'Insertion_code2': Insertion_code2, + 'Symmetry_operator_atom_1': Symmetry_operator_atom_1, + 'Symmetry_operator_atom_2': Symmetry_operator_atom_2, + 'Link_distance': pd.Series(Link_distance, dtype='float32'), + }) + else: + assert False, ( + 'No return', + 'a error occurred' + ) + return df + + @staticmethod + def dataframe2pdb(df, out_file): + """ + 将dataframe转化为pdb文件, support ATOM type: ATOM, HETATM + """ + with open(out_file, 'w') as f: + empyt_line = list(' ' * 80) + for i in df.iterrows(): + if i[1]['Identifier'].strip() == 'ATOM' or i[1]['Identifier'].strip() == 'HETATM': + c = i[1].tolist() + empyt_line[:7] = list(c[0]) + empyt_line[6:12] = list(c[1]) + empyt_line[12:16] = list(c[2]) + empyt_line[16] = list(c[3]) + empyt_line[17:20] = list(c[4]) + empyt_line[21] = list(c[5]) + empyt_line[22:26] = list(c[6]) + empyt_line[26] = list(c[7]) + empyt_line[30:38] = list(c[8]) + empyt_line[38:46] = list(c[9]) + empyt_line[46:54] = list(c[10]) + empyt_line[54:60] = list(c[11]) + empyt_line[60:66] = list(c[12]) + empyt_line[72:76] = list(c[13]) + empyt_line[76:78] = list(c[14]) + empyt_line[78:80] = [' ', ' '] + for i, j in enumerate(empyt_line): + if isinstance(j, list) and len(j) == 1: + empyt_line[i] = j[0] + cs = ''.join(empyt_line) + f.writelines(cs + '\n') + empyt_line = list(' ' * 80) + return out_file + + @staticmethod + def cleanpdb(before_string): + """ + 处理传入参数pdb 例如: pdb1b12.ent 1b12.pdb 转化成 1b12 + """ + # print(f'try to format {before_string}') + if '.ent' in before_string: + before_string = before_string[3:-4].lower() + elif '.pdb' in before_string: + before_string = before_string[:4].lower() + elif len(before_string) == 4: + before_string = before_string.lower() + else: + if len(before_string) != 4: + raise ValueError(f'length out of 4 {before_string}') + return before_string + + def modresdata(self) -> list: + """modresdata 标准残疾的修饰信息 + # 从SEQRES序列中获取MODRES的修饰残基的小分子,仅仅从SEQRES有时获取不全面 + 使用ModifiedResidues更加全面 + """ + modres_lst = self.read_line_list(first_column='MODRES', path=self.path) + lst = [i[12:15] for i in modres_lst] # 切片出修饰残基的小分子 + return list(set(lst)) + + def get_covalent(self): + # 从原始的pdb文件中可获取共价键信息 # ? 对其中的信息正确率为100% 来自原始的pdb文件中的信息 + res1 = self.link_df.query(f'Residue_name1 in {self.remove_modres_moleid}') + res2 = self.link_df.query(f'Residue_name2 in {self.remove_modres_moleid}') + df = pd.merge(res1, res2, how='outer') + df.astype('object') + # remove ions link with ligand # ! some molecule like 5H + # df = df[df['Residue_name1'].str.len() == 3] + # df = df[df['Residue_name2'].str.len() == 3] + return df + + @property + def ModifiedResidues(self): + ModifiedResiduesId = [] + for i in self.mole_id: + if i in self.modresdata(): + ModifiedResiduesId.append(i) + # 从SEQRES序列中获取MODRES的修饰残基的小分子,仅仅从SEQRES有时获取不全面 + seqres_lst = [i[19:].strip() for i in Bdp.read_line_list(first_column='SEQRES', path=self.path)] + seqres_lst_string = '\r\n'.join(seqres_lst) + seqres_lst_set = set(list(i for line in seqres_lst_string.splitlines() for i in line.split())) + seqres_list = seqres_lst_set.intersection(set(self.mole_id)) + ModifiedResiduesId.extend(seqres_list) + return list(set(ModifiedResiduesId)) + + @property + def remove_modres_moleid(self) -> list: + # 清除link信息中的modres,即修饰残基的ligid + # lst = Bdp.read_line_list(first_column='LINK',path=self.path) + # lst = list(set(map(lambda s:s[17:20],lst))) + clean_func = partial(self.func_dynamic_data_template, dynamic_data=self.ModifiedResidues) + res = clean_func(origin_data=self.mole_id) + return list(filter(lambda x: len(x.strip()) == 3 and x != 'HOH', res)) + + @staticmethod + def func_dynamic_data_template(origin_data: list, dynamic_data: list) -> list: + # 取origin_data和dynamic_data补集 + origin_data_set = set(origin_data) + dynamic_data_set = set(dynamic_data) + _data = origin_data_set.difference(dynamic_data_set) + return _data \ No newline at end of file diff --git a/vinautil/vinautil/vutils/pydock.py b/vinautil/vinautil/vutils/pydock.py new file mode 100644 index 0000000..abb477e --- /dev/null +++ b/vinautil/vinautil/vutils/pydock.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :pydock +@Description: : pydock 调用相关类, 准备pydock配置文件 +@Date :2023/1/5 13:02:26 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' +from pathlib import Path +from dataclasses import dataclass, field, asdict +from typing import Optional, NoReturn, List, Union, Dict, Tuple + +''' +https://life.bsc.es/pid/pydock/doc/tutorial.html#setup-process +zdock配置参考: +对于蛋白小分子相互作用的配置参考: +在 [ligand] 部分中将 newmol 改为 B 是因为在 pyDock 输出文件中,配体的链名应与受体的链名不同。这是因为,在对齐过程中,配体可能会在受体上进行旋转和平移,并且在输出文件中,应为每个复合物配置提供单独的链名。因此,在这种情况下,将 newmol 改为 B 可确保在 pyDock 输出文件中,配体的链名与受体的链名不同。 +如果不这样做,pyDock 可能无法正常运行。例如,如果在 [ligand] 部分中将 newmol 保留为 A,则在输出文件中受体和配体的链名将相同,这可能会导致 pyDock 无法识别受体和配体。 +对接完成之后可以将小分子的链重新标记为A +[receptor] +pdb = 1C5K.pdb +mol = A +newmol = A +restr = A.His.246,A.Ala.249 + +[ligand] +pdb = 1OAP.pdb +mol = A +newmol = B +restr = B.Ala.88 + +对于蛋白蛋白相互作用的配置参考: +[receptor] +pdb = 1C5K.pdb +mol = A +newmol = A +restr = + +[ligand] +pdb = 1OAP.pdb +mol = B +newmol = B +restr = +''' + +@dataclass() +class pyDockBaseConfig(): + pdb: Path + mol: str + newmol: str + restr: (str, type(None)) = field(default=None) + +@dataclass +class pyDockConfigReceptor(pyDockBaseConfig): + ... + +@dataclass +class pyDockConfigLigand(pyDockBaseConfig): + ... + +@dataclass +class pyDockConfig(): + receptor: pyDockConfigReceptor + ligand: pyDockConfigLigand + + def __str__(self): + return f''' +pyDockConfig class + ''' + + def to_file(self, out_file: Path) -> NoReturn: + with open(out_file.as_posix(), 'w', encoding='utf-8') as f: + f.write(f''' +[receptor] +pdb = {self.receptor.pdb.as_posix()} +mol = {self.receptor.mol} +newmol = {self.receptor.newmol} +restr = {self.receptor.restr} + +[ligand] +pdb = {self.ligand.pdb.as_posix()} +mol = {self.ligand.mol} +newmol = {self.ligand.newmol} +restr = {self.ligand.restr} +''') + + def to_dict(self): + return asdict(self) + + +def testUnit(): + c = pyDockConfig(receptor=pyDockConfigReceptor(pdb=Path('312w.pdb'), mol='A', newmol='A', restr=None), + ligand=pyDockConfigLigand(pdb=Path('a12w.pdb'), mol='B', newmol='B', restr=None)) + c.to_file(Path('test.txt')) + return c + diff --git a/vinautil/vinautil/vutils/pymol.py b/vinautil/vinautil/vutils/pymol.py new file mode 100644 index 0000000..aa640e9 --- /dev/null +++ b/vinautil/vinautil/vutils/pymol.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :pymol_api +@Description: : some pymol api remake +@Date :2023/1/1 15:06:35 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' +import sys +from pathlib import Path +from typing import Optional, Tuple, NoReturn, Dict, List, Iterable, Callable, Any +from pymol import cmd, finish_launching, rpc +from utils.typecheck import typeassert + +""" +cmd.centerofmass('organic and chain A') # 重心 +# finish_launching(['pymol','-R']) # without GUI pymol.finish_launching(['pymol', '-cq']) # for windows +""" + +class api(): + std_types = ['CYS', 'ILE', 'SER', 'VAL', 'GLN', 'LYS', 'ASN', 'PRO', 'THR', 'PHE', 'ALA', 'HIS', 'GLY', 'ASP', + 'LEU', 'ARG', 'TRP', 'GLU', 'TYR', 'MET', 'HID', 'HSP', 'HIE', 'HIP', 'CYX', 'CSS'] + regular_residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')] + """注意: 在PyMOL中,"organic"和"hetatm"是有区别的,"organic"只包含碳氢化合物,而"hetatm"包含所有非蛋白质 / 核酸 / 多肽分子。 + "organic": 找到含碳的小分子,但不匹配已知的聚合物。使用方法:organic。例如:indicate organic。 + "all": 当前加载到 PyMOL 中的所有原子。 + "none": 空选择。 + "enabled": 启用对象中的原子。 + "sele": 命名选择或对象 "sele",但仅当它与其他操作符的名称不冲突时。 + "%sele": 命名选择或对象 "sele"。推荐使用,以避免模糊性。 + atom: selects all atoms + chain: selects atoms based on their chain identifier + element: selects atoms based on their element type (e.g. "C" for carbon, "H" for hydrogen) + resn: This selection name selects atoms based on their residue name (e.g. "resn ALA" selects all atoms in alanine residues). + resi: selects atoms based on their residue number + name: selects atoms based on their name (e.g. "CA" for alpha carbon) + hetatm: selects atoms that are part of HETATM records in a PDB file + polymer: selects atoms that are part of known polymers + solvent: selects atoms that are part of solvent molecules + organic: selects atoms that are part of carbon-containing molecules that do not match known polymers + inorganic: selects atoms that are not part of carbon-containing molecules or known polymers + br.(byres) all: This selection name selects all atoms that are within a certain distance of another atom or group of atoms (e.g. "br. all within 5 of organic" selects all atoms within 5 angstroms of any organic small molecules). + """ + + @staticmethod + def pymol_start(args: list, rpc:bool = False, **kwargs) -> NoReturn: + """start pymol + pymol -q: 启动无GUI的pymol,同时忽略pymolrc文件(如果存在)。 + pymol -cq: 启动无GUI的pymol,并忽略pymolrc文件(如果存在)。这是在windows上启动无GUI的pymol的常用方法。 + finish_launching(['pymol','-cq']) # for windows start pymol without GUI + finish_launching(['pymol','-c']) # for linux start pymol with GUI + MacOS: not support finish_launching + use subprocess.Popen(['pymol','-cq', '-R']) instead open XMLRPC server for interaction + default port: 9123 # not support change port + """ + start_cmd = { + 'win32': finish_launching, + 'linux': finish_launching, + 'darwin': 'MacOS not support finish_launching, use subprocess.Popen instead', + } + if isinstance(start_cmd[sys.platform], Callable): + start_cmd[sys.platform](args) + elif isinstance(start_cmd[sys.platform], str): + print(start_cmd[sys.platform]) + else: + raise Exception(f'Not support {sys.platform} platform!') + hostname = kwargs.get('hostname') if kwargs.get('hostname') else 'localhost' + _xmlPort = kwargs.get('port') if kwargs.get('port') else 9123 + _nPortsToTry = kwargs.get('nPortsToTry') if kwargs.get('nPortsToTry') else 5 + if rpc: + rpc.launch_XMLRPC(hostname, _xmlPort, _nPortsToTry) + + @staticmethod + def pymol_exit() -> NoReturn: + ''' + @Description: : exit pymol + :return: : NoReturn + ''' + cmd.quit() + + def load_file(self, file: Path = None, clean:bool = True) -> NoReturn: + cmd.delete("all") + file = file if file else self.file + cmd.load(file.as_posix()) + if clean: cmd.remove('solvent metals') # remove solvent and metals + def save_molecule(self, ligandid: str, chain:str, format: str = "mol2", file:Path = None) -> NoReturn: + ligandid, chain = ligandid.upper(), chain.upper() + cmd.delete("all") + cmd.select(f"{ligandid}", f"chain {chain} and resn {ligandid}") + if file is None: + file = self.file.parent.joinpath(f'{ligandid}_{chain}.{format}').as_posix() + else: + file = file.as_posix() + cmd.save(file, ligandid) + cmd.delete("all") # clean pymol window + + @staticmethod + def AddPolarHydrogens(selection:str, out_file: Path = None, fmt:str = 'mol2') -> Optional[str]: + """ + The file format is automatically chosen if the extesion is one of + the supported output formats: pdb, pqr, mol, sdf, pkl, pkla, mmd, out, + dat, mmod, cif, pov, png, pse, psw, aln, fasta, obj, mtl, wrl, dae, idtf, + or mol2. + :param selection: pymol grammar selector + :param out_file: output file + :return: + openbabel method: + omol = list(pybel.readfile(format = 'mol2', filename = file))[0] + omol.OBMol.AddPolarHydrogens() + omol.write('mol2',out_file,overwrite=True) + def mol2add(file,out_file,fmt='mol2'): + mols = pybel.readfile(fmt, file) + mol = next(mols) + mol.OBMol.AddPolarHydrogens() + mol.write(fmt,out_file,overwrite=True) + """ + cmd.delete("all") + cmd.h_add(selection) # add all hydrogens in this molecular + cmd.remove(f"{selection} & hydro & not nbr. (don.|acc.)") # remove no-polar-hydrogen + if out_file: + cmd.save(filename=out_file.as_posix(), selection=selection) + else: + return cmd.get_str(format=fmt, selection=selection) + @staticmethod + def pymol_get_sequence(resn:str, chain:str) -> List[str]: + """ + 获取链{chain}残基名称为{resn}的残基序列。 + """ + res = [] + cmd.iterate(selection=f'resn {resn} and chain {chain}', expression=lambda atom: res.append(atom.resi)) + res_seq = list(set(res)) + if len(res_seq) > 1: # 在同一条链上可以有两个以上相同的小分子 + print(f"Warning: resn {resn} and chain {chain} has more than one residue sequence: output {res_seq}") + return res_seq + + @staticmethod + def pymol_get_around(resn:str, chain:str, distance:float, extend: bool = True, add_self:bool = True) -> str: + """ + 获取链{chain}残基名称为{resn}的周围{distance}A的pdb字符。 + :param resn: 残基名称 + :param chain: 链 + :param distance: 距离 + :param add_self: 是否添加resn选择对象自身 + :return: pdb字符 +file = 'pdb7axn.ent.gz' + +resn = 'S6B' +chain = 'A' +distance = 3.00 +cmd.delete("all") +cmd.load(file.as_posix()) +cmd.remove('metal solvent') +cmd.select('sele', f'resn {resn} and chain {chain}') +cmd.center('sele') +cmd.orient('sele') +obj_name = 'around_and_self' +# cmd.create(obj_name, f'(resn {resn} and chain {chain}) around {distance}') # 不扩展残基 +# cmd.create(obj_name, f'byres (resn {resn} and chain {chain}) around {distance}') # 扩展残基 +# cmd.create(obj_name, f'(resn {resn} and chain {chain}) around {distance} or (resn {resn} and chain {chain})') # 选择resn的对象和resn对象周围3A的原子 +cmd.create(obj_name, f'byres (resn {resn} and chain {chain}) around {distance} or (resn {resn} and chain {chain})') # 选择resn的对象和resn对象周围3A的原子扩展至残基的原子 +cmd.hide('everything', 'all') +cmd.show('lines', obj_name) +pdbstr = cmd.get_pdbstr(selection=obj_name) +print(pdbstr) + """ + obj_name = f'chain_{chain}_{resn}_around_{round(distance,2)}' + if add_self: + if extend: + cmd.create(obj_name, + f'byres (resn {resn} and chain {chain}) around {distance} or (resn {resn} and chain {chain})') + else: + cmd.create(obj_name, f'(resn {resn} and chain {chain}) around {distance} or (resn {resn} and chain {chain})') + else: + if extend: + cmd.create(obj_name, f'byres (resn {resn} and chain {chain}) around {distance}') + else: + cmd.create(obj_name, f'(resn {resn} and chain {chain}) around {distance}') + pdb_str = cmd.get_pdbstr(selection=obj_name) + cmd.delete(obj_name) # delete the object + return pdb_str + @staticmethod + def pymol_get_centerofmass(object_name) -> List[float]: + """ + 获取 pymol 中对象的重心。 + + 这是一个用来获取 pymol 中对象的重心的函数坐标 + + :param object_name: 对象的名称 + :type object_name: str,optional + :return: 重心的坐标。 + """ + return cmd.centerofmass(object_name) + + @staticmethod + def pymol_print_exec(object_name: str = 'organic', atom_attribute: str = 'resn') -> List[str]: + """ + 打印 pymol 中对象的属性。 + + 这是一个用来打印 pymol 中对象的属性的函数。 + + :param object_name: 对象的名称,默认为 'organic'。 + :type object_name: str,optional + :param atom_attribute: 属性的名称,默认为 'resn'。 + Optional: ['ID', 'alt', 'b', 'cartoon', 'chain', 'color', + 'elec_radius', 'elem', 'flags', 'formal_charge', + 'geom', 'index', 'label', 'model', 'name', 'numeric_type', + 'oneletter', 'p', 'partial_charge', 'protons', 'q', 'rank', 'reps', 'resi', + 'resn', 'resv', 's', 'segi', 'ss', 'stereo', 'text_type', 'type', 'valence', 'vdw'] + 'p' is not supported. 's' is wrapper.SettingWrapper object + :type atom_attribute: str,optional + :return: 所选对象的数量和属性的值的列表。 + """ + local_namespace = {'res': [], 'cmd': cmd} + CMD = f"cmd.iterate(selection='{object_name}', expression=lambda atom: res.append(atom.{atom_attribute}))" + exec(CMD, local_namespace) + return local_namespace['res'] + + @staticmethod + def get_coords(selection: str): + return cmd.get_coords(selection) + @staticmethod + def pymol_get_resn(object_name: str = 'all') -> Tuple[str]: + resn_list = [] + cmd.iterate(selection=object_name, expression=lambda atom: resn_list.append(atom.resn)) + return tuple(set(resn_list)) + + @staticmethod + def Mutagenesis_site(filename: Path, mutation_type: str, site: int, outfile: Path = None) -> Path: + """Mutagenesis_site Mutagenesis residue in site + residue site have mutil conformations, need to select one conformations, some error accured. + pass + _extended_summary_ + + Arguments: + filename {str} -- PDB file format + mutation_type {str} -- 'VAL' for ALA TO VAL; 'ALA' for any/not ALA to ALA; 'GLY' for ALA to GLY + site {int} -- residue site in pdbfile + + Keyword Arguments: + outfile {str} -- _description_ (default: {None}) + + Raises: + ValueError: not one object in PDBs,need to fix + + Returns: + str -- save mutagenesis file path + """ + p = Path(filename) + savename = p.stem + f"_{site}_mutation.pdb" + _out_file = Path(outfile) if outfile else p.absolute().parent.joinpath(savename) + if not _out_file.absolute().parent.exists(): _out_file.absolute().parent.mkdir(parents=True) + cmd.delete('all') # ! clean up + cmd.load(filename) + PDBs = cmd.get_names() + # Get the ID numbers of c-alpha (CA) atoms of all residues + if len(PDBs) == 1: + PDB = PDBs[0] + else: + raise ValueError(f'this pdb have more than one object!PDBs:{PDBs}') + CAindex = cmd.identify(f"{PDB} and name CA") + pdbstrList = [cmd.get_pdbstr("%s and id %s" % (PDB, CAid)).splitlines() for CAid in CAindex] + ProtChainResiList = [[PDB, i[0][21], i[0][22:26].strip()] for i in pdbstrList] + for i, j, k in ProtChainResiList: + if int(k) == int(site): + cmd.wizard("mutagenesis") + # print(i,j,k) + cmd.refresh_wizard() + cmd.get_wizard().set_mode(mutation_type) + ##Possible mutation_type could be: + ##'VAL' for ALA TO VAL + ##'ALA' for any/not ALA to ALA + ##'GLY' for ALA to GLY + # 'selection' will select each residue that you have selected + # on ProtChainResiList above using the PDBid,chain,and residue + # present on your pdb file.If you didn't select a range on + # ProteinChainResiList, it will do the mutation on all the residues + # present in your protein. + selection = f"/{i}//{j}/{k}" + # Print selection to check the output + # print(selection) + # Selects where to place the mutation + cmd.get_wizard().do_select(selection) + ##Applies mutation + cmd.get_wizard().apply() + # Save each mutation and reinitialize the session before the next mutation + ##to have pdb files only with the residue-specific single-point mutation you were interested. + cmd.set_wizard("done") + cmd.save(_out_file.as_posix(), f"{PDB}") + cmd.delete('all') # Reinitialize PyMOL to default settings. + return _out_file \ No newline at end of file diff --git a/vinautil/vinautil/vutils/pymol_test.py b/vinautil/vinautil/vutils/pymol_test.py new file mode 100644 index 0000000..7988b80 --- /dev/null +++ b/vinautil/vinautil/vutils/pymol_test.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from pathlib import Path +from pymol import cmd, finish_launching +import ast +from typing import Optional, Tuple, NoReturn, Dict, List, Iterable, Callable, Any + +def get_resn(name:str = 'all'): + resn_list = [] + cmd.iterate(selection=name, expression=lambda atom: resn_list.append(atom.resn)) + return tuple(set(resn_list)) + +def pymol_print(object_name: str = 'hetatm', atom_attribute: str = 'resn') -> Tuple[int, List[str]]: + """ + 打印 pymol 中对象的属性。 + + 这是一个用来打印 pymol 中对象的属性的函数。 + + :param object_name: 对象的名称,默认为 'hetatm'。 + :type object_name: str,optional + :param atom_attribute: 属性的名称,默认为 'resn'。 + Optional: ['ID', 'alt', 'b', 'cartoon', 'chain', 'color', + 'elec_radius', 'elem', 'flags', 'formal_charge', + 'geom', 'index', 'label', 'model', 'name', 'numeric_type', + 'oneletter', 'p', 'partial_charge', 'protons', 'q', 'rank', 'reps', 'resi', + 'resn', 'resv', 's', 'segi', 'ss', 'stereo', 'text_type', 'type', 'valence', 'vdw'] + :type atom_attribute: str,optional + :return: 所选对象的数量和属性的值的列表。 + """ + local_namespace = {'res': [], 'cmd': cmd}, + cmd_obj = ast.literal_eval( + f"cmd.iterate(selection='{object_name}', expression=lambda atom: res.append(atom.{atom_attribute}))" + ) + select_num = eval(cmd_obj, local_namespace) + return (select_num, res) + +def pymol_print_exec(object_name: str = 'hetatm', atom_attribute: str = 'resn') -> Tuple[int, List[str]]: + """ + 打印 pymol 中对象的属性。 + + 这是一个用来打印 pymol 中对象的属性的函数。 + + :param object_name: 对象的名称,默认为 'hetatm'。 + :type object_name: str,optional + :param atom_attribute: 属性的名称,默认为 'resn'。 + Optional: ['ID', 'alt', 'b', 'cartoon', 'chain', 'color', + 'elec_radius', 'elem', 'flags', 'formal_charge', + 'geom', 'index', 'label', 'model', 'name', 'numeric_type', + 'oneletter', 'p', 'partial_charge', 'protons', 'q', 'rank', 'reps', 'resi', + 'resn', 'resv', 's', 'segi', 'ss', 'stereo', 'text_type', 'type', 'valence', 'vdw'] + 'p' is not supported. 's' is wrapper.SettingWrapper object + :type atom_attribute: str,optional + :return: 所选对象的数量和属性的值的列表。 + """ + local_namespace = {'res': [], 'cmd': cmd} + CMD = f"cmd.iterate(selection='{object_name}', expression=lambda atom: res.append(atom.{atom_attribute}))" + exec(CMD, local_namespace) + return local_namespace['res'] + +def delete(i_cmd: cmd = None): + if i_cmd: cmd = i_cmd + if cmd is i_cmd: + print('same object') + cmd.delete("all") + +if __name__ == '__main__': + launch_res = finish_launching(['pymol', '-R']) # default port 9123 localhost + print(launch_res) + PDBdata = list(Path('C:\\Users\\Admin\\program\\scarversion\\dock20221224\\PDBdatabase20221231').rglob('*.ent.gz')) + cmd.load(PDBdata[1234]) + cmd.remove('metal solvent') + hetatm_lst = get_resn('hetatm') + organic_lst = get_resn('organic') + print(f'hetatm: {hetatm_lst}') + print(f'organic: {organic_lst}') + + attribute_lst = ['ID', 'alt', 'b', 'cartoon', 'chain', 'color', 'elec_radius', 'elem', 'flags', 'formal_charge', 'geom', 'index', 'label', 'model', 'name', 'numeric_type', 'oneletter', 'partial_charge', 'protons', 'q', 'rank', 'reps', 'resi', 'resn', 'resv', 's', 'segi', 'ss', 'stereo', 'text_type', 'type', 'valence', 'vdw'] + res = [] + cmd.iterate(selection='hetatm', expression=lambda atom: res.append(atom.ID)) + print(f"{res[0]}----{len(res)}") + for attribute in attribute_lst: + print(attribute) + res = pymol_print_exec('hetatm', attribute) + print(f"{res[0]}----{len(res)}") + res = [] + res = pymol_print_exec('hetatm', '') + print(res) + cmd.get_coords('hetatm') + cmd.set_color() + + + + + + + + diff --git a/vinautil/vinautil/vutils/pyrosettafix.py b/vinautil/vinautil/vutils/pyrosettafix.py new file mode 100644 index 0000000..e25ece1 --- /dev/null +++ b/vinautil/vinautil/vutils/pyrosettafix.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :pyrosettafix +@Description: : +@Date :2023/3/11 16:54:38 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' + +# 使用pyrosetta快速修复蛋白缺失的侧链 +from pathlib import Path +import pyrosetta +from pyrosetta import rosetta +from multiprocessing import Pool +from pyrosetta.toolbox import cleanATOM +import argparse, os, sys + + +def fix_optimize(file: Path, out_file: Path): + ''' + FastRelax使用快速梯度下降算法,可以在较短时间内对蛋白质进行优化,并且对于结构中的非构象缺陷,例如氢键、离子对、芳香性相互作用和溶剂-蛋白质相互作用等进行优化。 + ref2015是Rosetta程序包中的一个分数函数,它是Rosetta2015中引入的一个新的蛋白质力场,用于蛋白质结构预测和设计。这个力场是从先前的Rosetta力场中提炼出来的,经过了一系列的校正和优化,可以更好地预测蛋白质的折叠构象。在FastRelax中,ref2015可以作为一个可选参数来指定使用哪个力场来进行优化。 + :param file: + :param out_file: + :return: + ''' + # 使用pyrosetta修复蛋白结构 + cleanATOM(file.as_posix()) + file = file.parent.joinpath(f'{file.stem}.clean{file.suffix}') + # 初始化PyRosetta + pyrosetta.init() + # 读入蛋白质结构 + pose = pyrosetta.pose_from_pdb(file.as_posix()) + # fix residue side chain + scorefxn = pyrosetta.create_score_function('ref2015') + relax = pyrosetta.rosetta.protocols.relax.FastRelax(scorefxn) + relax.apply(pose) + # 输出修复后的结构 + pose.dump_pdb(out_file.as_posix()) + +def main_multi(p): + dire = Path(p) + if not dire.exists(): raise NotADirectoryError(f'Not found: {p.as_posix()}') + files = list(dire.rglob('*.pdb')) + + # 创建进程池 + with Pool() as pool: + # 将fix_optimize()函数应用到每个文件 + results = [pool.apply_async(fix_optimize, args=(file, file.parent.joinpath(file.name.split('.')[0] + '_relaxed.pdb'))) for file in files] + # 等待所有进程完成 + for result in results: + result.wait() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Optimize pdb struct') + parser.add_argument('-p', '--pdb_dir', metavar="PDB files directory (.pdb format)", nargs='?', + default=sys.stdin, help='receptors directory') + args = parser.parse_args() + main_multi(args.pdb_dir) \ No newline at end of file diff --git a/vinautil/vinautil/vutils/read.py b/vinautil/vinautil/vutils/read.py new file mode 100644 index 0000000..33f5d17 --- /dev/null +++ b/vinautil/vinautil/vutils/read.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :read +@Description: : read cheminfo file +@Date :2023/4/9 11:14:29 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' + +import re +import pandas as pd +from utils.pdbparse_module import Bdp + +def read_mol2_file(file_path): + # 读取文件内容 + with open(file_path, 'r') as file: + content = file.read() + # 使用正则表达式匹配@ATOM和下一个@之间的内容 + atom_section = re.search(r'@ATOM(.*?)@', content, re.DOTALL) + # 如果找到匹配项,提取原子记录并将其分割成单独的行 + if atom_section: + atom_records = atom_section.group(1).strip().split('\n') + else: + raise ValueError('未找到@ATOM节') + # 将原子记录分割成单独的字段,并将其保存为pandas DataFrame + atom_data = [record.split() for record in atom_records] + df = pd.DataFrame(atom_data, columns=['atom_id', 'atom_name', 'x', 'y', 'z', 'atom_type', 'subst_id', 'subst_name', 'charge']) + return df + +def get_mol_covalent_atom_coords(df:pd.DataFrame, search_atom_name:str): # locate the atom coords in the mol2 file + df_c = df[df['atom_name'] == search_atom_name] + if len(df_c.index) == 1: + ind = df_c.index[0] + x,y,z = df_c.loc[ind, 'x'], df_c.loc[ind, 'y'], df_c.loc[ind, 'z'] + return x,y,z + +def get_rec_covalent_atom_coords(pdbfile, resname: str, chain: str, resseq: str, atomname: str): + # locate the atom coords in the pdb file + df = Bdp.clean_struct(file=pdbfile, label='ATOM') + # 去除所有字符串列中的前后空格 + df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x) + bdp = df + # keep locate the atom coords failed in the pdb file, clean alpha construct in the pdb file + df_c = bdp[(bdp['ResidueName'] == resname.upper()) & (bdp['ChainIndentifier'] == chain.upper()) & (bdp['ResidueSequence'] == str(resseq)) & (bdp['AtomName'] == atomname)] + if len(df_c.index) == 1: + ind = df_c.index[0] + x,y,z = df_c.loc[ind, 'X'], df_c.loc[ind, 'Y'], df_c.loc[ind, 'Z'] + return x,y,z + + diff --git a/vinautil/vinautil/vutils/spyrmsd_load.py b/vinautil/vinautil/vutils/spyrmsd_load.py new file mode 100644 index 0000000..67f978b --- /dev/null +++ b/vinautil/vinautil/vutils/spyrmsd_load.py @@ -0,0 +1,95 @@ +from spyrmsd.molecule import Molecule +from spyrmsd import io, rmsd +from spyrmsd.exceptions import NonIsomorphicGraphs +from numpy import nan as np_nan +from spyrmsd.optional.obabel import to_molecule +from openbabel import pybel +from typing import List + +def symmrmsd_mol2_list(mol2_ref: str, mol2_docked: List[str], minimize=False) -> float: + ref = to_molecule(pybel.readstring('mol2',mol2_ref)) + mol_lst = [to_molecule(pybel.readstring('mol2',i)) for i in mol2_docked] + mol = mol_lst[0] + ref.strip() # remove Hydrogen atom + mol.strip() # remove Hydrogen atom + coords_ref = ref.coordinates + anum_ref = ref.atomicnums + adj_ref = ref.adjacency_matrix + coords = [mol.coordinates for mol in mol_lst] + anum = mol.atomicnums + adj = mol.adjacency_matrix + try: + RMSD = rmsd.symmrmsd( + coords_ref, + coords, + anum_ref, + anum, + adj_ref, + adj, + minimize=minimize + ) + return RMSD + except NonIsomorphicGraphs: + return np_nan + +def rmsd_mol2_list(mol2_ref: str, mol2_docked: List[str], minimize=False) -> List[float]: + ref = to_molecule(pybel.readstring('mol2',mol2_ref)) + ref.strip() # remove Hydrogen atom + coords_ref = ref.coordinates + mol_ref_atomnum = ref.atomicnums + + RMSDs = [] + for mol2 in mol2_docked: + try: + mol = to_molecule(pybel.readstring('mol2',mol2)) + mol.strip() # remove Hydrogen atom + RMSD = rmsd.rmsd( + coords1=coords_ref, + coords2=mol.coordinates, + atomicn1=mol_ref_atomnum, + atomicn2=mol.atomicnums, + minimize=minimize + ) + RMSDs.append(RMSD) + except: + RMSDs.append(np_nan) + return RMSDs + +def symmrmsd_mol2(mol2_ref:str, mol2_docked:str, minimize=False) -> float: + ref = to_molecule(pybel.readstring(mol2_ref, 'mol2')) + mol = to_molecule(pybel.readstring(mol2_docked, 'mol2')) + ref.strip() # remove Hydrogen atom + mol.strip() # remove Hydrogen atom + coords_ref = ref.coordinates + anum_ref = ref.atomicnums + adj_ref = ref.adjacency_matrix + coords = [mol.coordinates] + anum = mol.atomicnums + adj = mol.adjacency_matrix + try: + RMSD = rmsd.symmrmsd( + coords_ref, + coords, + anum_ref, + anum, + adj_ref, + adj, + minimize=minimize + ) + return RMSD[0] + except NonIsomorphicGraphs: + return np_nan + +def rmsd_mol2(mol2_ref: str, mol2_docked: str, minimize=False) -> float: + mol_ref = to_molecule(pybel.readstring(mol2_ref, 'mol2')) + mol_docked = to_molecule(pybel.readstring(mol2_docked, 'mol2')) + mol_ref.strip() # remove Hydrogen atom + mol_docked.strip() # remove Hydrogen atom + RMSD = rmsd.rmsd( + coords1=mol_ref.coordinates, + coords2=mol_docked.coordinates, + atomicn1=mol_ref.atomicnums, + atomicn2=mol_docked.atomicnums, + minimize=minimize + ) + return RMSD \ No newline at end of file diff --git a/vinautil/vinautil/vutils/typecheck.py b/vinautil/vinautil/vutils/typecheck.py new file mode 100644 index 0000000..77cb859 --- /dev/null +++ b/vinautil/vinautil/vutils/typecheck.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :typecheck.py +@Description: : +@Date :2022/12/28 15:17:33 +@Author :hotwa +@version :1.1 +''' + + +# Descriptor for a type-checked attribute +class Typed: + def __init__(self, name, expected_type): + self.name = name + self.expected_type = expected_type + + def __get__(self, instance, cls): + if not instance: + return self + else: + return instance.__dict__[self.name] + + def __set__(self, instance, value): + if not isinstance(value, self.expected_type): + raise TypeError(f'Expected {str(self.expected_type)}, not {type(value)}') + instance.__dict__[self.name] = value + + def __delete__(self, instance): + del instance.__dict__[self.name] + + +# Class decorator that applies it to selected attributes +def typeassert(**kwargs): + def decorate(cls): + for name, expected_type in kwargs.items(): + # Attach a Typed descriptor to the class + setattr(cls, name, Typed(name, expected_type)) + return cls + + return decorate \ No newline at end of file diff --git a/vinautil/vinautil/vutils/vinabox2ledockbox.py b/vinautil/vinautil/vutils/vinabox2ledockbox.py new file mode 100644 index 0000000..05bde74 --- /dev/null +++ b/vinautil/vinautil/vutils/vinabox2ledockbox.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +''' +@file :vinabox2ledockbox +@Description: : +@Date :2023/2/22 14:18:51 +@Author :lyzeng +@mail :pylyzeng@gmail.com +@version :1.0 +''' + + + diff --git a/vinautil/vinautil/vutils/vinadock.py b/vinautil/vinautil/vutils/vinadock.py new file mode 100644 index 0000000..182737a --- /dev/null +++ b/vinautil/vinautil/vutils/vinadock.py @@ -0,0 +1,19 @@ +from pathlib import Path +from vina import Vina + +def dockvina(receptor, ligand, center, box_size, exhaustiveness=32,n_poses=20,out_n_poses = 5): + out_stem = {Path(receptor).stem}--{Path(ligand).stem} + v = Vina(sf_name='vina') + v.set_receptor(receptor) + v.set_ligand_from_file(ligand) + v.compute_vina_maps(center=center, box_size=box_size) + # Score the current pose + energy = v.score() + print('Score before minimization: %.3f (kcal/mol)' % energy[0]) + # Minimized locally the current pose + energy_minimized = v.optimize() + print('Score after minimization : %.3f (kcal/mol)' % energy_minimized[0]) + v.write_pose(f'{out_stem}_minimized.pdbqt', overwrite=True) + # Dock the ligand + v.dock(exhaustiveness=exhaustiveness, n_poses=n_poses) + v.write_poses(f'{out_stem}.pdbqt', n_poses=out_n_poses, overwrite=True) \ No newline at end of file