Source code for torchref.cli.strip_altlocs

#!/usr/bin/env python3
"""Strip alternate conformations from a PDB file.

For each residue with altlocs, keeps the conformer with the highest mean
occupancy. If occupancies are tied, keeps the alphabetically first altloc
(normally A).  Atoms without altloc are always kept.

Usage
-----
::

    torchref.strip-altlocs input.pdb output.pdb
"""

import argparse
import sys

import pandas as pd

from torchref.io import pdb


def strip_altlocs(df):
    """Remove alternate conformations, keeping the higher-occupancy conformer.

    For every residue (identified by chain ID, residue number, insertion
    code and residue name) that carries alternate-location IDs, the
    conformer with the highest mean occupancy is retained; ties are broken
    in favour of the alphabetically first altloc.  Atoms whose altloc is
    missing, empty or a single space are always kept.

    Parameters
    ----------
    df : pd.DataFrame
        PDB dataframe with ``altloc``, ``occupancy``, ``chainid``,
        ``resseq``, ``icode`` and ``resname`` columns.

    Returns
    -------
    pd.DataFrame
        New dataframe with altlocs removed.  When any altloc was present,
        the ``altloc`` column is blanked and ``serial`` is renumbered from
        1; otherwise an unmodified copy of ``df`` is returned.
    """
    altloc = df["altloc"]
    no_alt = altloc.isna() | altloc.isin(["", " "])
    if no_alt.all():
        return df.copy()

    res_cols = ["chainid", "resseq", "icode", "resname"]
    key_cols = res_cols + ["altloc"]
    has_alt = df.loc[~no_alt, key_cols + ["occupancy"]]

    # One winning altloc per residue: highest mean occupancy, then the
    # alphabetically first altloc on a tie.  dropna=False matters: a missing
    # insertion code (NaN) would otherwise make groupby discard the whole
    # residue, and every one of its conformers would be deleted.
    best_altloc = (
        has_alt.groupby(key_cols, dropna=False, sort=False)["occupancy"]
        .mean()
        .reset_index()
        .sort_values(["occupancy", "altloc"], ascending=[False, True])
        .drop_duplicates(subset=res_cols, keep="first")[key_cols]
    )

    # Left merge keeps the row order of `has_alt`; `best_altloc` has unique
    # keys, so the row count is preserved and the indicator column lines up
    # one-to-one with the altloc atoms.  merge matches NaN keys to NaN keys.
    matched = has_alt[key_cols].merge(
        best_altloc, on=key_cols, how="left", indicator=True
    )
    is_best = (matched["_merge"] == "both").to_numpy()

    # Build the mask positionally so a non-unique dataframe index cannot
    # cause one selected atom to drag in other rows sharing its label.
    blank = no_alt.to_numpy()
    keep_mask = blank.copy()
    keep_mask[~blank] = is_best

    result = df[keep_mask].copy()
    result["altloc"] = ""
    result["serial"] = range(1, len(result) + 1)
    return result
def main():
    """Command-line entry point: strip altlocs from a PDB file.

    Parses the ``input`` and ``output`` paths from the command line, loads
    the input with ``pdb.load_as_dataframe``, removes alternate
    conformations via ``strip_altlocs``, prints a short before/after
    summary and writes the result with ``pdb.write``.

    Returns
    -------
    int
        Always ``0``.
    """
    parser = argparse.ArgumentParser(
        prog="torchref.strip-altlocs",
        description="Strip alternate conformations from a PDB file, "
        "keeping the conformer with the highest occupancy.",
    )
    parser.add_argument("input", help="Input PDB file")
    parser.add_argument("output", help="Output PDB file")
    args = parser.parse_args()

    df = pdb.load_as_dataframe(args.input)
    n_before = len(df)

    # Use the same "no altloc" test as strip_altlocs (missing, empty or a
    # single space) so the reported count agrees with what is stripped, and
    # include the insertion code so residues such as 10 and 10A are counted
    # separately.
    blank_altloc = df["altloc"].isna() | df["altloc"].isin(["", " "])
    n_residues_with_alt = len(
        df.loc[~blank_altloc, ["chainid", "resseq", "icode"]].drop_duplicates()
    )

    result = strip_altlocs(df)
    result.attrs = df.attrs

    n_after = len(result)
    print(f"Input: {n_before} atoms, {n_residues_with_alt} residues with altlocs")
    print(f"Output: {n_after} atoms (removed {n_before - n_after})")

    pdb.write(result, args.output)
    print(f"Written to {args.output}")
    return 0
if __name__ == "__main__":
    # Run the CLI when executed as a script; a falsy return from main()
    # (including None) is reported as exit status 0.
    exit_status = main() or 0
    sys.exit(exit_status)