"""Find the TCR-pMHC structures that have not been through MD yet and package them.

Workflow (same steps as the original notebook cells, in order):

1. Parse the hand-maintained list of PDB ids whose MD run has finished.
2. Read every PDB id from the ``pdb_id`` column of ``TCR-pMHC4.xlsx``.
3. Take the set difference: ids in the sheet with no finished MD run.
4. Gather the repaired ``*.pdb`` files from both ``*_polymeric_fix``
   directories into ``fixed/``.
5. Copy ``<id>.modellerfix.pdb`` for every not-yet-run id into ``notrunMD/``
   and write ``notrunMD.tar.gz``.

Notes from the recorded run of the original notebook: the sheet held 146
unique ids (110 rows with MHC_class 1, 36 with MHC_class 2) and 89 ids were
listed as finished, yet the difference had 59 entries instead of 57.  Two
entries of the finished list were mistyped (``inam`` for ``1nam`` and
``3o41`` for ``3o4l``), so they matched nothing in the sheet and both
structures were staged for MD a second time.  The list below is corrected,
and ``find_not_run`` now refuses finished ids that are absent from the sheet
so that this kind of typo cannot go unnoticed again.
"""

import shutil
import tarfile
from pathlib import Path

import pandas as pd

# ---------------------------------------------------------------------------
# Configuration: every path / column name a reader might want to change.
# ---------------------------------------------------------------------------
EXCEL_PATH = Path("TCR-pMHC4.xlsx")
SHEET_NAME = "Sheet1"
ID_COLUMN = "pdb_id"
CLASS_COLUMN = "MHC_class"

SOURCE_DIRS = (Path("single_polymeric_fix"), Path("multimer_polymeric_fix"))
FIXED_DIR = Path("fixed")
NOT_RUN_DIR = Path("notrunMD")
ARCHIVE_PATH = Path("notrunMD.tar.gz")
PDB_SUFFIX = ".modellerfix.pdb"

# ---------------------------------------------------------------------------
# PDB ids whose MD simulation has finished, one per line.
# Corrected entries: "3o41" -> "3o4l" and "inam" -> "1nam".
# ---------------------------------------------------------------------------
MD_finish = '''1ao7
1mi5
1oga
2vlj
2vlk
3o4l
4ftv
1bd2
1fo0
1j8h
1nam
2bnr
2iam
2ol3
2p5e
2z31
3h9s
3hg1
3kps
3pwp
3qdg
3qdm
3qib
3vxm
3vxs
4jrx
5brz
5bs0
4g9f
5nht
5nqk
5isz
3gsn
1d9k
1u3h
3dxa
3ffc
3kpr
3sjv
3utt
4eup
4h1l
4mji
4qrp
5c0a
4ozf
4pri
5eu6
5ivx
5m02
5men
5tez
5yxn
6am5
6amu
6avf
6cql
6g9q
6mtm
6px6
6py2
6r0e
6r2l
6rpa
6tro
6vrm
6vrn
6zkw
7dzm
7jwj
7n1e
7n1f
7n2n
7na5
7ndq
7nme
7ow5
7qpj
7rtr
7sg0
7t2c
8gom
8gvb
1zgl
1g6r
3qiu
4p2o
6u3n
2ypl'''


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def normalize_ids(raw_ids):
    """Return the set of stripped, lower-cased, non-empty ids in ``raw_ids``.

    Args:
        raw_ids: Any iterable of values; each one is converted with ``str``.

    Returns:
        A ``set`` of normalised id strings.
    """
    cleaned = (str(raw).strip().lower() for raw in raw_ids)
    return {pdb_id for pdb_id in cleaned if pdb_id}


def parse_pdb_ids(text):
    """Parse a newline-separated block of ids into a normalised set."""
    return normalize_ids(text.splitlines())


def is_valid_pdb_id(pdb_id):
    """Tell whether ``pdb_id`` looks like a classic PDB id.

    A classic PDB id is four alphanumeric characters and starts with a digit.
    """
    return len(pdb_id) == 4 and pdb_id[0].isdigit() and pdb_id.isalnum()


def find_not_run(all_ids, finished_ids):
    """Return the ids in ``all_ids`` that are not in ``finished_ids``.

    Args:
        all_ids: Every id known to the data set (the spreadsheet column).
        finished_ids: Ids whose MD run has finished.

    Returns:
        ``set(all_ids) - set(finished_ids)``.

    Raises:
        ValueError: If a finished id is not part of ``all_ids``.  Such an id
            is almost certainly a typo, and it would silently leave its real
            counterpart in the "not run" result.
    """
    known = set(all_ids)
    finished = set(finished_ids)
    unknown = finished - known
    if unknown:
        raise ValueError(
            f"finished ids not present in the data set: {sorted(unknown)}"
        )
    return known - finished


def _fresh_directory(directory):
    """Create ``directory`` empty, removing any previous content first."""
    directory = Path(directory)
    if directory.exists():
        shutil.rmtree(directory)
    directory.mkdir(parents=True)
    return directory


def collect_pdb_files(source_dirs, target_dir):
    """Copy every ``*.pdb`` file of ``source_dirs`` into a fresh ``target_dir``.

    Replaces ``rm -rf ./fixed/*`` followed by two ``cp <dir>/*.pdb ./fixed``
    calls; unlike those, it also works when ``target_dir`` does not exist.
    Files with the same name in a later source directory overwrite earlier
    ones, exactly as the consecutive ``cp`` calls did.

    Returns:
        The list of copied destination paths.
    """
    target_dir = _fresh_directory(target_dir)
    copied = []
    for source_dir in source_dirs:
        for pdb_path in sorted(Path(source_dir).glob("*.pdb")):
            copied.append(Path(shutil.copy(pdb_path, target_dir)))
    return copied


def copy_structures(pdb_ids, source_dir, target_dir, suffix=PDB_SUFFIX):
    """Copy ``<id><suffix>`` for each id from ``source_dir`` to ``target_dir``.

    ``target_dir`` is rebuilt from scratch so files staged by an earlier run
    cannot leak into the archive.  All inputs are checked before anything is
    deleted or copied.

    Raises:
        FileNotFoundError: If the structure file of any id is missing; the
            message lists every missing id.

    Returns:
        The list of copied destination paths, ordered by id.
    """
    source_dir = Path(source_dir)
    wanted = {
        pdb_id: source_dir / f"{pdb_id}{suffix}" for pdb_id in sorted(pdb_ids)
    }
    missing = [pdb_id for pdb_id, path in wanted.items() if not path.is_file()]
    if missing:
        raise FileNotFoundError(
            f"no '*{suffix}' file in {source_dir} for: {missing}"
        )
    target_dir = _fresh_directory(target_dir)
    return [Path(shutil.copy(path, target_dir)) for path in wanted.values()]


def archive_directory(directory, archive_path):
    """Write ``directory`` into a gzip-compressed tar file at ``archive_path``.

    Members are stored under the directory's own name, matching
    ``tar zcvf notrunMD.tar.gz notrunMD/``.

    Returns:
        ``archive_path`` as a ``Path``.
    """
    directory = Path(directory)
    archive_path = Path(archive_path)
    with tarfile.open(archive_path, "w:gz") as archive:
        archive.add(directory, arcname=directory.name)
    return archive_path


# ---------------------------------------------------------------------------
# Finished-run ids (pure computation, safe to evaluate on import)
# ---------------------------------------------------------------------------
run_MD = parse_pdb_ids(MD_finish)

_malformed = sorted(pdb_id for pdb_id in run_MD if not is_valid_pdb_id(pdb_id))
if _malformed:
    raise ValueError(f"malformed PDB ids in MD_finish: {_malformed}")


# ---------------------------------------------------------------------------
# Workflow (touches the file system; runs in a notebook kernel or as a script)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Read the worksheet; its first row holds the column names.
    df = pd.read_excel(
        EXCEL_PATH, sheet_name=SHEET_NAME, header=0, engine="calamine"
    )
    # Show the first rows to confirm the sheet loaded correctly.
    print(df.head())

    column_set = normalize_ids(df[ID_COLUMN].dropna())
    print(f"{len(column_set)} ids in sheet, {len(run_MD)} with finished MD")
    print(df[CLASS_COLUMN].value_counts())

    diff = find_not_run(column_set, run_MD)
    print(f"{len(diff)} structures without MD run:")
    print(sorted(diff))

    fixed_files = collect_pdb_files(SOURCE_DIRS, FIXED_DIR)
    print(f"{len(fixed_files)} pdb files gathered in {FIXED_DIR}/")

    staged_files = copy_structures(diff, FIXED_DIR, NOT_RUN_DIR)
    print(f"{len(staged_files)} pdb files staged in {NOT_RUN_DIR}/")

    archive_directory(NOT_RUN_DIR, ARCHIVE_PATH)
    print(f"wrote {ARCHIVE_PATH}")