class PDBModeler:
    # NOTE(review): only the two alignment helpers that are fully visible in
    # this excerpt are reproduced here. The remaining members of the class
    # (e.g. ``__init__`` and ``make_model``) are truncated in the excerpt and
    # are therefore not included.

    @staticmethod
    def find_non_dash_indices(seq) -> "tuple[int | None, int | None]":
        """Locate the span of *seq* that excludes leading and trailing '-'.

        Args:
            seq: A string (or any reversible sequence of single characters).

        Returns:
            ``(start, end)`` such that ``seq[start:end]`` begins and ends with
            a character other than '-'. ``(None, None)`` when *seq* is empty
            or consists solely of '-'.
        """
        start = next((i for i, char in enumerate(seq) if char != '-'), None)
        if start is None:
            # Nothing but gaps (or nothing at all): there is no span to report.
            return None, None

        # A non-gap character exists, so the reverse scan always finds one.
        trailing_gaps = next(
            i for i, char in enumerate(reversed(seq)) if char != '-'
        )
        return start, len(seq) - trailing_gaps

    @staticmethod
    def align_sequences(file: Path) -> Path:
        """Trim terminal gaps from a two-record FASTA and keep a gap-free record.

        The first record whose sequence starts or ends with '-' defines a
        ``[start, end)`` span (see ``find_non_dash_indices``); both records
        are sliced to that span. The first sliced record containing no '-'
        is then written to ``<file>`` with its suffix replaced by
        ``.selected.fasta``.

        Args:
            file: Path to a FASTA file holding exactly two records.
                Presumably the two records are rows of a pairwise alignment
                of equal length -- TODO confirm against the caller.

        Returns:
            *file* itself when neither record starts or ends with '-';
            otherwise the path of the newly written ``.selected.fasta`` file.

        Raises:
            AssertionError: If the file does not contain exactly two records,
                or if no record is free of '-' after trimming.
        """
        fx = pyfastx.Fasta(file.as_posix(), build_index=True)
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with -O.
        if len(fx) != 2:
            raise AssertionError(
                f"expected exactly 2 sequences in {file}, found {len(fx)}"
            )

        # Read each record's name and sequence text once up front.
        records = [(record.name, record.seq) for record in fx]

        # Pick the first record that has a leading or trailing gap; its
        # non-gap span decides where both records are cut.
        # NOTE(review): if both records have terminal gaps, only the first
        # one's span is used -- confirm this is intended.
        reference = next(
            (
                text
                for _, text in records
                if text.startswith('-') or text.endswith('-')
            ),
            None,
        )
        if reference is None:
            # Neither record needs trimming: hand back the original file.
            return file

        start, end = PDBModeler.find_non_dash_indices(reference)

        # Cut both records at the same positions.
        trimmed_seqs = [
            SeqRecord(Seq(text[start:end]), id=name, description="")
            for name, text in records
        ]

        # Keep the first trimmed record that contains no gap character at all
        # (which also guarantees it neither starts nor ends with '-').
        selected_seq = next(
            (record for record in trimmed_seqs if '-' not in record.seq),
            None,
        )
        if selected_seq is None:
            raise AssertionError("no sequence without '-' found")

        # Write the selected sequence to a new FASTA file using Biopython.
        new_fasta_file = file.with_suffix('.selected.fasta')
        with open(new_fasta_file, 'w') as output_handle:
            SeqIO.write([selected_seq], output_handle, "fasta")

        return new_fasta_file