{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TCR-pMHC complexes without a finished MD run\n",
    "\n",
    "Compares the `pdb_id` column of `TCR-pMHC4.xlsx` with the list of complexes whose MD run has finished, copies the `*.modellerfix.pdb` files of the remaining complexes into `notrunMD/` and packs that directory as `notrunMD.tar.gz`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import shutil\n",
    "import tarfile\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might need to change lives in this cell.\n",
    "EXCEL_FILE = Path('TCR-pMHC4.xlsx')\n",
    "SHEET_NAME = 'Sheet1'\n",
    "\n",
    "# Directories holding the input *.pdb files; later entries win on name clashes.\n",
    "SOURCE_DIRS = [Path('single_polymeric_fix'), Path('multimer_polymeric_fix')]\n",
    "FIXED_DIR = Path('fixed')\n",
    "NOT_RUN_DIR = Path('notrunMD')\n",
    "ARCHIVE = Path('notrunMD.tar.gz')\n",
    "PDB_SUFFIX = '.modellerfix.pdb'\n",
    "\n",
    "# A PDB ID is four characters: a digit 1-9 followed by three alphanumerics.\n",
    "PDB_ID_PATTERN = re.compile(r'[1-9][a-z0-9]{3}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Complexes with a finished MD run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One PDB ID per line.\n",
    "MD_finish = '''1ao7\n",
    "1mi5\n",
    "1oga\n",
    "2vlj\n",
    "2vlk\n",
    "3o4l\n",
    "4ftv\n",
    "1bd2\n",
    "1fo0\n",
    "1j8h\n",
    "1nam\n",
    "2bnr\n",
    "2iam\n",
    "2ol3\n",
    "2p5e\n",
    "2z31\n",
    "3h9s\n",
    "3hg1\n",
    "3kps\n",
    "3pwp\n",
    "3qdg\n",
    "3qdm\n",
    "3qib\n",
    "3vxm\n",
    "3vxs\n",
    "4jrx\n",
    "5brz\n",
    "5bs0\n",
    "4g9f\n",
    "5nht\n",
    "5nqk\n",
    "5isz\n",
    "3gsn\n",
    "1d9k\n",
    "1u3h\n",
    "3dxa\n",
    "3ffc\n",
    "3kpr\n",
    "3sjv\n",
    "3utt\n",
    "4eup\n",
    "4h1l\n",
    "4mji\n",
    "4qrp\n",
    "5c0a\n",
    "4ozf\n",
    "4pri\n",
    "5eu6\n",
    "5ivx\n",
    "5m02\n",
    "5men\n",
    "5tez\n",
    "5yxn\n",
    "6am5\n",
    "6amu\n",
    "6avf\n",
    "6cql\n",
    "6g9q\n",
    "6mtm\n",
    "6px6\n",
    "6py2\n",
    "6r0e\n",
    "6r2l\n",
    "6rpa\n",
    "6tro\n",
    "6vrm\n",
    "6vrn\n",
    "6zkw\n",
    "7dzm\n",
    "7jwj\n",
    "7n1e\n",
    "7n1f\n",
    "7n2n\n",
    "7na5\n",
    "7ndq\n",
    "7nme\n",
    "7ow5\n",
    "7qpj\n",
    "7rtr\n",
    "7sg0\n",
    "7t2c\n",
    "8gom\n",
    "8gvb\n",
    "1zgl\n",
    "1g6r\n",
    "3qiu\n",
    "4p2o\n",
    "6u3n\n",
    "2ypl'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split() also tolerates stray blank lines or trailing spaces in the list.\n",
    "run_MD = set(MD_finish.split())\n",
    "\n",
    "# Catch typos such as a letter in place of the leading digit.\n",
    "malformed_ids = sorted(i for i in run_MD if not PDB_ID_PATTERN.fullmatch(i))\n",
    "assert not malformed_ids, f'not valid PDB IDs: {malformed_ids}'\n",
    "\n",
    "len(run_MD)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Complexes listed in the spreadsheet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the worksheet; its first row holds the column names.\n",
    "df = pd.read_excel(EXCEL_FILE, sheet_name=SHEET_NAME, header=0, engine='calamine')\n",
    "\n",
    "# Show the first rows to confirm the sheet loaded as expected.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalise case and whitespace so the IDs compare equal to the ones in MD_finish.\n",
    "column_set = set(df['pdb_id'].dropna().astype(str).str.strip().str.lower())\n",
    "len(column_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows per MHC class.\n",
    "df['MHC_class'].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Complexes still waiting for an MD run\n",
    "\n",
    "The outputs saved with an earlier version of this notebook reported 59 pending complexes. In that version `1nam` and `3o4l` were mistyped as `inam` and `3o41` in the finished list, so both were wrongly counted as pending. Re-run the notebook to refresh the numbers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A finished ID that is absent from the sheet is most likely a typo, and a typo\n",
    "# silently leaves the real complex in the pending set.\n",
    "unknown_ids = sorted(run_MD - column_set)\n",
    "assert not unknown_ids, f'finished IDs missing from the sheet (typo?): {unknown_ids}'\n",
    "\n",
    "diff = column_set - run_MD\n",
    "len(diff)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(diff)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect the structures and pack the pending ones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rebuild_dir(path: Path) -> Path:\n",
    "    \"\"\"Remove `path` with everything in it, recreate it empty and return it.\"\"\"\n",
    "    if path.exists():\n",
    "        shutil.rmtree(path)\n",
    "    path.mkdir(parents=True)\n",
    "    return path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from an empty directory so files from an earlier run cannot linger.\n",
    "rebuild_dir(FIXED_DIR)\n",
    "for source_dir in SOURCE_DIRS:\n",
    "    for pdb_file in sorted(source_dir.glob('*.pdb')):\n",
    "        shutil.copy(pdb_file, FIXED_DIR / pdb_file.name)\n",
    "\n",
    "len(list(FIXED_DIR.glob('*.pdb')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recreate the target as well; otherwise complexes that were pending in an\n",
    "# earlier run would stay in the directory and end up in the archive.\n",
    "rebuild_dir(NOT_RUN_DIR)\n",
    "\n",
    "missing_ids = []\n",
    "for pdb_id in sorted(diff):\n",
    "    structure = FIXED_DIR / f'{pdb_id}{PDB_SUFFIX}'\n",
    "    if structure.is_file():\n",
    "        shutil.copy(structure, NOT_RUN_DIR / structure.name)\n",
    "    else:\n",
    "        missing_ids.append(pdb_id)\n",
    "\n",
    "# Report every missing structure at once instead of stopping at the first one.\n",
    "assert not missing_ids, f'no {PDB_SUFFIX} file in {FIXED_DIR} for: {missing_ids}'\n",
    "\n",
    "len(list(NOT_RUN_DIR.glob('*.pdb')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with tarfile.open(ARCHIVE, 'w:gz') as archive:\n",
    "    archive.add(str(NOT_RUN_DIR), arcname=NOT_RUN_DIR.name)\n",
    "\n",
    "# Read the archive back to confirm that it holds one file per pending complex.\n",
    "with tarfile.open(ARCHIVE, 'r:gz') as archive:\n",
    "    archived_files = [member.name for member in archive.getmembers() if member.isfile()]\n",
    "assert len(archived_files) == len(diff)\n",
    "\n",
    "len(archived_files)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Notes (2024-03-07)\n",
    "\n",
    "`['5ksa', '1g6r', '6bga', '6u3n', '1zgl', '7l1d', '3qiu', '2ypl', '7z50', '6uln', '1mwa', '4z7v', '4ozi', '7rrg', '6uk4', '4p2o', '6vm8', '5ksb']` 重新建模\n",
    "\n",
    "- 1g6r: Align 对齐过程中,序列编号出错导致残基缺失,进而导致MD模拟后续失败\n",
    "- 6u3n: A开头地方与B链的碰撞\n",
    "- 6uk4: 结构分散(不适合模拟)\n",
    "- 6uln: 结构分散(不适合模拟)\n",
    "- 6vm8: 结构分散(不适合模拟)\n",
    "- 7l1d: 无碰撞,结构分散(不适合模拟)\n",
    "- 7rrg: 无碰撞,结构分散(不适合模拟)\n",
    "- 7z50: A与B链的碰撞\n",
    "- 1mwa: B链116缺失\n",
    "- 1zgl: M链122,123缺失\n",
    "- 2ypl: D链124E链121碰撞\n",
    "- 3qiu: A与B链的碰撞\n",
    "- 4ozi: A链72位与B链5位(β折叠)碰撞\n",
    "- 4p2o: A链67位与B链4位碰撞\n",
    "- 4z7v: C链96位与D链105碰撞\n",
    "- 5ksa: 缺失过多导致修复后loop过多(不适合模拟)\n",
    "- 5ksb: C链83位与D链5位碰撞\n",
    "- 6bga: B链137,138虚线缺失"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}