Files
analysis_pdb/diff.ipynb
2024-03-06 17:14:49 +08:00

618 lines
16 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# PDB IDs (lowercase, one per line) of the complexes whose MD runs have finished.\n",
"# They are compared by exact string match against the spreadsheet's `pdb_id` column,\n",
"# so every entry must be spelled exactly as it appears there.\n",
"MD_finish = '''1ao7\n",
"1mi5\n",
"1oga\n",
"2vlj\n",
"2vlk\n",
"3o4l\n",
"4ftv\n",
"1bd2\n",
"1fo0\n",
"1j8h\n",
"1nam\n",
"2bnr\n",
"2iam\n",
"2ol3\n",
"2p5e\n",
"2z31\n",
"3h9s\n",
"3hg1\n",
"3kps\n",
"3pwp\n",
"3qdg\n",
"3qdm\n",
"3qib\n",
"3vxm\n",
"3vxs\n",
"4jrx\n",
"5brz\n",
"5bs0\n",
"4g9f\n",
"5nht\n",
"5nqk\n",
"5isz\n",
"3gsn\n",
"1d9k\n",
"1u3h\n",
"3dxa\n",
"3ffc\n",
"3kpr\n",
"3sjv\n",
"3utt\n",
"4eup\n",
"4h1l\n",
"4mji\n",
"4qrp\n",
"5c0a\n",
"4ozf\n",
"4pri\n",
"5eu6\n",
"5ivx\n",
"5m02\n",
"5men\n",
"5tez\n",
"5yxn\n",
"6am5\n",
"6amu\n",
"6avf\n",
"6cql\n",
"6g9q\n",
"6mtm\n",
"6px6\n",
"6py2\n",
"6r0e\n",
"6r2l\n",
"6rpa\n",
"6tro\n",
"6vrm\n",
"6vrn\n",
"6zkw\n",
"7dzm\n",
"7jwj\n",
"7n1e\n",
"7n1f\n",
"7n2n\n",
"7na5\n",
"7ndq\n",
"7nme\n",
"7ow5\n",
"7qpj\n",
"7rtr\n",
"7sg0\n",
"7t2c\n",
"8gom\n",
"8gvb\n",
"1zgl\n",
"1g6r\n",
"3qiu\n",
"4p2o\n",
"6u3n\n",
"2ypl'''"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"run_MD = set(MD_finish.splitlines())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"89"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(run_MD)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_192237/2850835352.py:1: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of pdb_id MHC_class TCR_alpha TCR_beta peptide MHC_alpha MHC_beta Beta2m \\\n",
"0 1d9k 2 A B P C D NaN \n",
"1 1g6r 1 A B P H NaN L \n",
"2 1kj2 1 A B P H NaN L \n",
"3 1mwa 1 A B P H NaN L \n",
"4 1u3h 2 E F I G H NaN \n",
".. ... ... ... ... ... ... ... ... \n",
"141 7sg0 2 D E C A B NaN \n",
"142 7t2c 2 D E C A B NaN \n",
"143 8gom 1 D E C A NaN B \n",
"144 8gvb 1 A B P H NaN L \n",
"145 7phr 1 A B P H NaN L \n",
"\n",
" protein_type DOI ... No. of bonds Polar Surface Area \\\n",
"0 m 10.7554/eLife.82934 ... NaN NaN \n",
"1 m 10.7554/eLife.82916 ... 149.0 415.57 \n",
"2 m 10.7554/eLife.82914 ... NaN NaN \n",
"3 m 10.7554/eLife.82912 ... NaN NaN \n",
"4 m 10.7554/eLife.82935 ... NaN NaN \n",
".. ... ... ... ... ... \n",
"141 s 10.1093/nar/gkad398 ... NaN NaN \n",
"142 s 10.1093/nar/gkad401 ... NaN NaN \n",
"143 s 10.1093/nar/gkad386 ... NaN NaN \n",
"144 s 10.1093/nar/gkad383 ... NaN NaN \n",
"145 m 10.1093/nar/gkad380 ... NaN NaN \n",
"\n",
" XLOGP3 open banel LogP HB donor HB acceptor Rotatable bonds \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 1.91 -0.09 12.0 13.0 40.0 \n",
"2 NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN \n",
".. ... ... ... ... ... \n",
"141 NaN NaN NaN NaN NaN \n",
"142 NaN NaN NaN NaN NaN \n",
"143 NaN NaN NaN NaN NaN \n",
"144 NaN NaN NaN NaN NaN \n",
"145 NaN NaN NaN NaN NaN \n",
"\n",
" Canonical SMILES Unnamed: 49 \\\n",
"0 NaN NaN \n",
"1 CC[C@@H]([C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N... NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
".. ... ... \n",
"141 NaN NaN \n",
"142 NaN NaN \n",
"143 NaN NaN \n",
"144 NaN NaN \n",
"145 NaN NaN \n",
"\n",
" others \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 B链116117动画缺失文件正常 \n",
"4 NaN \n",
".. ... \n",
"141 NaN \n",
"142 NaN \n",
"143 NaN \n",
"144 NaN \n",
"145 有peptides的结果中是否存在序列数目大于5的结果: 1 \\r\\n ['7phr'] \n",
"\n",
"[146 rows x 51 columns]>\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Excel workbook holding the TCR-pMHC structure table (path relative to the notebook).\n",
"file_path = 'TCR-pMHC4.xlsx'\n",
"\n",
"# Read the worksheet named 'Sheet1', using its first row as the column names.\n",
"# NOTE: engine='calamine' requires pandas >= 2.2 and the python-calamine package.\n",
"df = pd.read_excel(file_path, sheet_name='Sheet1', header=0, engine='calamine')\n",
"\n",
"# Show the first rows to confirm the sheet loaded correctly. `head` has to be\n",
"# called; printing the attribute only shows the bound-method repr.\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"column_set = set(df['pdb_id'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"146"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(column_set)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"89"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(run_MD)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"diff = column_set - run_MD"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(diff)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"110"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df[df['MHC_class'] == 1])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df[df['MHC_class'] == 2])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'1kj2',\n",
" '1mwa',\n",
" '1nam',\n",
" '2nx5',\n",
" '2vlr',\n",
" '3mbe',\n",
" '3mv7',\n",
" '3mv8',\n",
" '3mv9',\n",
" '3o4l',\n",
" '3pqy',\n",
" '4e41',\n",
" '4gg6',\n",
" '4ozi',\n",
" '4p2q',\n",
" '4p2r',\n",
" '4z7u',\n",
" '4z7v',\n",
" '4z7w',\n",
" '5c07',\n",
" '5c08',\n",
" '5c09',\n",
" '5c0b',\n",
" '5c0c',\n",
" '5d2l',\n",
" '5d2n',\n",
" '5e6i',\n",
" '5e9d',\n",
" '5euo',\n",
" '5jhd',\n",
" '5ks9',\n",
" '5ksa',\n",
" '5ksb',\n",
" '5nme',\n",
" '5nmg',\n",
" '5wkh',\n",
" '5wlg',\n",
" '6avg',\n",
" '6bga',\n",
" '6bj2',\n",
" '6rp9',\n",
" '6rpb',\n",
" '6rsy',\n",
" '6u3o',\n",
" '6uk4',\n",
" '6uln',\n",
" '6uon',\n",
" '6vm8',\n",
" '6vmx',\n",
" '6vqo',\n",
" '7l1d',\n",
" '7n6e',\n",
" '7phr',\n",
" '7rdv',\n",
" '7rk7',\n",
" '7rrg',\n",
" '7sg1',\n",
" '7t2b',\n",
" '7z50'}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diff"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"!rm -rf ./fixed/*"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# `cp` with several source files needs an existing target directory, so create it\n",
"# first (no-op when it is already there).\n",
"!mkdir -p ./fixed\n",
"# Gather the repaired structures from both fix directories into ./fixed.\n",
"!cp single_polymeric_fix/*.pdb ./fixed\n",
"!cp multimer_polymeric_fix/*.pdb ./fixed"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"146"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pathlib import Path\n",
"len(list(Path('fixed').glob('*.pdb')))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p notrunMD"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import shutil\n",
"\n",
"# Copy the '<id>.modellerfix.pdb' file of every PDB ID in `diff` from ./fixed\n",
"# into ./notrunMD, keeping the file name unchanged.\n",
"src_dir = Path('./fixed')\n",
"dst_dir = Path('./notrunMD')\n",
"for pdb_id in diff:\n",
"    file_name = f'{pdb_id}.modellerfix.pdb'\n",
"    shutil.copy((src_dir / file_name).as_posix(), (dst_dir / file_name).as_posix())"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(list(Path('./notrunMD').glob('*.pdb')))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"notrunMD/\n",
"notrunMD/6uln.modellerfix.pdb\n",
"notrunMD/5ks9.modellerfix.pdb\n",
"notrunMD/1mwa.modellerfix.pdb\n",
"notrunMD/4z7w.modellerfix.pdb\n",
"notrunMD/7rdv.modellerfix.pdb\n",
"notrunMD/4z7u.modellerfix.pdb\n",
"notrunMD/5euo.modellerfix.pdb\n",
"notrunMD/5c08.modellerfix.pdb\n",
"notrunMD/6bga.modellerfix.pdb\n",
"notrunMD/1nam.modellerfix.pdb\n",
"notrunMD/5c07.modellerfix.pdb\n",
"notrunMD/7l1d.modellerfix.pdb\n",
"notrunMD/6vm8.modellerfix.pdb\n",
"notrunMD/7rk7.modellerfix.pdb\n",
"notrunMD/1kj2.modellerfix.pdb\n",
"notrunMD/6rpb.modellerfix.pdb\n",
"notrunMD/5nme.modellerfix.pdb\n",
"notrunMD/3mbe.modellerfix.pdb\n",
"notrunMD/6avg.modellerfix.pdb\n",
"notrunMD/5jhd.modellerfix.pdb\n",
"notrunMD/4p2r.modellerfix.pdb\n",
"notrunMD/3pqy.modellerfix.pdb\n",
"notrunMD/5e9d.modellerfix.pdb\n",
"notrunMD/3mv7.modellerfix.pdb\n",
"notrunMD/3mv9.modellerfix.pdb\n",
"notrunMD/2nx5.modellerfix.pdb\n",
"notrunMD/6rsy.modellerfix.pdb\n",
"notrunMD/7phr.modellerfix.pdb\n",
"notrunMD/6rp9.modellerfix.pdb\n",
"notrunMD/7rrg.modellerfix.pdb\n",
"notrunMD/5d2n.modellerfix.pdb\n",
"notrunMD/4ozi.modellerfix.pdb\n",
"notrunMD/2vlr.modellerfix.pdb\n",
"notrunMD/3o4l.modellerfix.pdb\n",
"notrunMD/5ksa.modellerfix.pdb\n",
"notrunMD/7n6e.modellerfix.pdb\n",
"notrunMD/4p2q.modellerfix.pdb\n",
"notrunMD/6uk4.modellerfix.pdb\n",
"notrunMD/6vmx.modellerfix.pdb\n",
"notrunMD/7z50.modellerfix.pdb\n",
"notrunMD/5d2l.modellerfix.pdb\n",
"notrunMD/4z7v.modellerfix.pdb\n",
"notrunMD/5e6i.modellerfix.pdb\n",
"notrunMD/6uon.modellerfix.pdb\n",
"notrunMD/6vqo.modellerfix.pdb\n",
"notrunMD/5wkh.modellerfix.pdb\n",
"notrunMD/5ksb.modellerfix.pdb\n",
"notrunMD/6bj2.modellerfix.pdb\n",
"notrunMD/7sg1.modellerfix.pdb\n",
"notrunMD/5c09.modellerfix.pdb\n",
"notrunMD/3mv8.modellerfix.pdb\n",
"notrunMD/4gg6.modellerfix.pdb\n",
"notrunMD/5nmg.modellerfix.pdb\n",
"notrunMD/5c0b.modellerfix.pdb\n",
"notrunMD/5c0c.modellerfix.pdb\n",
"notrunMD/5wlg.modellerfix.pdb\n",
"notrunMD/6u3o.modellerfix.pdb\n",
"notrunMD/7t2b.modellerfix.pdb\n",
"notrunMD/4e41.modellerfix.pdb\n"
]
}
],
"source": [
"!tar zcvf notrunMD.tar.gz notrunMD/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}