"""
PDB file format reading and writing.
This module provides functions for reading and writing PDB files containing
atomic coordinate data.
Functions
---------
read
Read a PDB file and return a reader object.
write
Write atomic coordinates to a PDB file.
find_header_length
Find the number of header lines in a PDB file.
load_as_dataframe
Load a PDB file into a pandas DataFrame.
read_crystallographic_info
Extract unit cell and space group from a PDB file.
Classes
-------
PDBReader
Reader class for PDB files.
Examples
--------
::
from torchref.io import pdb
# Reading
reader = pdb.read('structure.pdb', verbose=1)
df, cell, spacegroup = reader()
# Writing
pdb.write(df, 'output.pdb')
"""
from typing import List, Optional, Tuple
import numpy as np
import pandas as pd
[docs]
def read_crystallographic_info(
filepath: str,
) -> Tuple[Optional[List[float]], Optional[str], Optional[str]]:
"""
Extract crystallographic information from a PDB file.
Reads the CRYST1 record to obtain unit cell parameters and space group.
Parameters
----------
filepath : str
Path to the PDB file.
Returns
-------
cell : list of float or None
Unit cell parameters [a, b, c, alpha, beta, gamma] in A and degrees.
spacegroup : str or None
Space group symbol.
z : str or None
Number of molecules per unit cell.
"""
with open(filepath, "r") as f:
for line in f:
if "CRYST1" in line:
a = line[6:15]
b = line[15:24]
c = line[24:33]
alpha = line[33:40]
beta = line[40:47]
gamma = line[47:54]
spacegroup = line[55:68].strip()
z = line[68:].strip()
cell = [
float(a),
float(b),
float(c),
float(alpha),
float(beta),
float(gamma),
]
return cell, spacegroup, z
return None, None, None
[docs]
def load_as_dataframe(
filepath: str, skipheader: int = 0, skipfooter: int = 1
) -> pd.DataFrame:
"""
Load a PDB file into a pandas DataFrame.
Parses ATOM, HETATM, and ANISOU records from a PDB file and returns
a structured DataFrame with all atomic properties.
Parameters
----------
filepath : str
Path to the PDB file.
skipheader : int, optional
Number of header lines to skip. If 0, automatically detected.
skipfooter : int, optional
Number of footer lines to skip. Default is 1.
Returns
-------
pd.DataFrame
DataFrame with columns: ATOM, serial, name, altloc, resname, chainid,
resseq, icode, x, y, z, occupancy, tempfactor, element, charge,
anisou_flag, u11, u22, u33, u12, u13, u23, index.
DataFrame attributes include 'cell', 'spacegroup', and 'z'.
"""
if skipheader == 0:
skipheader = find_header_length(filepath)
colspecs = [
(0, 6),
(6, 11),
(12, 16),
(16, 17),
(17, 20),
(21, 22),
(22, 26),
(26, 27),
(30, 38),
(38, 46),
(46, 54),
(54, 60),
(60, 66),
(76, 78),
(78, 80),
]
names = [
"ATOM",
"serial",
"name",
"altloc",
"resname",
"chainid",
"resseq",
"icode",
"x",
"y",
"z",
"occupancy",
"tempfactor",
"element",
"charge",
]
pdb = pd.read_fwf(
filepath,
names=names,
colspecs=colspecs,
skiprows=skipheader,
skipfooter=skipfooter,
keep_default_na=False,
na_values=[""],
)
pdb["anisou_flag"] = False
# Read ANISOU records
anisou_names = [
"ATOM",
"serial",
"name",
"altloc",
"resname",
"chainid",
"resseq",
"u11",
"u22",
"u33",
"u12",
"u13",
"u23",
"element",
]
anisou_colspecs = [
(0, 6),
(6, 11),
(12, 16),
(16, 17),
(17, 20),
(21, 22),
(22, 26),
(29, 35),
(36, 42),
(43, 49),
(50, 56),
(57, 63),
(63, 70),
(76, 78),
]
anisou = pd.read_fwf(
filepath,
names=anisou_names,
colspecs=anisou_colspecs,
skiprows=skipheader,
skipfooter=skipfooter,
keep_default_na=False,
na_values=[""],
)
anisou = anisou.loc[anisou["ATOM"] == "ANISOU"]
pdb = pdb.loc[(pdb["ATOM"] == "ATOM") | (pdb["ATOM"] == "HETATM")]
anisou.drop(columns=["ATOM"], inplace=True)
pdb = pdb.merge(
anisou,
on=["serial", "name", "altloc", "resname", "chainid", "resseq", "element"],
how="left",
)
pdb.loc[pdb["u11"].notnull(), "anisou_flag"] = True
pdb[["u11", "u22", "u33", "u12", "u13", "u23"]] = (
pdb[["u11", "u22", "u33", "u12", "u13", "u23"]].astype(float) / 1e4
)
pdb[["serial", "resseq"]] = pdb[["serial", "resseq"]].astype(int)
pdb[["x", "y", "z", "occupancy", "tempfactor"]] = pdb[
["x", "y", "z", "occupancy", "tempfactor"]
].astype(float)
pdb[["altloc", "icode"]] = pdb[["altloc", "icode"]].fillna("")
pdb["charge"] = (
pdb["charge"]
.astype(str)
.str.strip("+")
.str.replace("1-", "-1")
.str.replace("2-", "-2")
.astype(float)
.fillna(0)
.astype(int)
)
pdb["element"] = pdb["element"].astype(str).str.strip().str.capitalize()
pdb["index"] = np.arange(pdb.shape[0]).astype(int)
try:
cell, spacegroup, z = read_crystallographic_info(filepath)
except:
cell, spacegroup, z = None, None, None
pdb.attrs["cell"] = cell
pdb.attrs["spacegroup"] = spacegroup
pdb.attrs["z"] = z
return pdb
[docs]
class PDBReader:
"""
Reader for PDB files containing atomic coordinate data.
This class reads PDB files and extracts atomic coordinates, properties,
and crystallographic metadata.
Attributes
----------
verbose : int
Verbosity level for logging.
dataframe : pd.DataFrame
DataFrame containing atomic data.
cell : list or None
Unit cell parameters [a, b, c, alpha, beta, gamma].
spacegroup : str or None
Space group symbol.
Examples
--------
::
reader = pdb.read('structure.pdb', verbose=1)
df, cell, spacegroup = reader()
print(f"Loaded {len(df)} atoms")
"""
[docs]
def __init__(self, verbose: int = 0):
"""
Initialize PDB reader.
Parameters
----------
verbose : int, optional
Verbosity level (0=silent, 1=normal, 2=debug). Default is 0.
"""
self.verbose = verbose
self.dataframe = None
self.cell = None
self.spacegroup = None
self.z = None
self.links = None
[docs]
def read(self, filepath: str) -> "PDBReader":
"""
Read a PDB file and extract atomic data.
Parameters
----------
filepath : str
Path to the PDB file.
Returns
-------
PDBReader
Self, for method chaining.
"""
if self.verbose > 1:
print(f"Reading PDB file: {filepath}")
self.dataframe = load_as_dataframe(filepath)
self.cell, self.spacegroup, self.z = read_crystallographic_info(filepath)
self.links = extract_link_records(filepath, verbose=self.verbose)
if self.verbose > 0:
print(f"Loaded {len(self.dataframe)} atoms")
return self
[docs]
def __call__(self) -> Tuple[pd.DataFrame, Optional[np.ndarray], Optional[str]]:
"""
Return extracted data in a standardized format.
Returns
-------
dataframe : pd.DataFrame
DataFrame with atomic data.
cell : np.ndarray or None
Unit cell parameters [a, b, c, alpha, beta, gamma].
spacegroup : str or None
Space group symbol.
"""
if self.dataframe is None:
raise ValueError("No data loaded. Call read() first.")
return self.dataframe, self.cell, self.spacegroup
[docs]
def read(filepath: str, verbose: int = 0) -> PDBReader:
"""
Read a PDB file.
Parameters
----------
filepath : str
Path to the PDB file.
verbose : int, optional
Verbosity level. Default is 0.
Returns
-------
PDBReader
Reader object with data loaded.
"""
return PDBReader(verbose=verbose).read(filepath)
[docs]
def write(df: pd.DataFrame, filepath: str, template: str = None, metadata=None) -> None:
"""
Write a DataFrame to a PDB file.
Parameters
----------
df : pandas.DataFrame
DataFrame containing atom data with columns: ATOM, serial, name,
altloc, resname, chainid, resseq, icode, x, y, z, occupancy,
tempfactor, element, charge.
filepath : str
Output PDB filename.
template : str, optional
PDB template file to copy header from (deprecated, use metadata).
metadata : RefinementMetadata, optional
Metadata to render as PDB header (REMARK 3, TITLE, etc.).
"""
with open(filepath, "w") as n:
# Write metadata header if provided (before CRYST1)
if metadata is not None:
n.write(metadata.render_pdb_header())
# Copy template header if provided (deprecated path)
if template is not None:
with open(template) as t:
for line in t:
if "REMARK" not in line and "ATOM" in line:
break
n.write(line)
# Write CRYST1 record if cell info available (directly before atoms)
try:
cell = df.attrs["cell"]
spacegroup = df.attrs["spacegroup"]
cell_abc = cell[:3]
cell_angles = cell[3:]
z = df.attrs.get("z", "")
try:
strz = str(int(z))
except:
strz = ""
line = (
"CRYST1"
+ "".join([f"{i:>9.3f}" for i in cell_abc])
+ "".join([f"{i:>7.2f}" for i in cell_angles])
+ " "
+ f"{spacegroup:<14}"
+ strz
+ "\n"
)
n.write(line)
except:
print("No cell information found, writing without cell and spacegroup")
# Write atom records
for i, row in df.iterrows():
(
ATOM,
serial,
name,
altloc,
resname,
chainid,
resseq,
icode,
x,
y,
z_coord,
occupancy,
tempfactor,
element,
charge,
) = row[
[
"ATOM",
"serial",
"name",
"altloc",
"resname",
"chainid",
"resseq",
"icode",
"x",
"y",
"z",
"occupancy",
"tempfactor",
"element",
"charge",
]
]
if charge > 0:
charge = "+" + str(charge)
elif charge == 0:
charge = ""
else:
charge = str(charge)
if len(name) > 3:
name = name[-3:]
if len(name) < 3:
name = name + " " * (3 - len(name))
if chainid is None or str(chainid) == "nan":
chainid = ""
try:
s = (
f"{str(ATOM):<6}{int(serial):>5}{str(name):>5}{str(altloc):>1}"
f"{str(resname):>3}{str(chainid):>2}{int(resseq):>4}{str(icode):>4}"
f"{round(x, 3):>8}{round(y, 3):>8}{round(z_coord, 3):>8}"
f"{round(occupancy, 3):>6.2f}{round(tempfactor, 2):>6}"
f"{str(element):>12}{charge:>2}\n"
)
n.write(s)
except:
print("row", i, "failed")
print(row)
# Write ANISOU record if present
if row["anisou_flag"]:
u11, u22, u33, u12, u13, u23 = row[
["u11", "u22", "u33", "u12", "u13", "u23"]
]
s = (
f"ANISOU{int(serial):>5}{str(name):>5}{str(altloc):>1}"
f"{str(resname):>3}{str(chainid):>2}{int(resseq):>4} "
f"{int(u11 * 1e4):>{7}}{int(u22 * 1e4):>{7}}{int(u33 * 1e4):>{7}}"
f"{int(u12 * 1e4):>{7}}{int(u13 * 1e4):>{7}}{int(u23 * 1e4):>{7}}"
f" {str(element):>{2}}{str(charge):>2}\n"
)
n.write(s)
n.write("END")
[docs]
def write_multi_model(
dataframes: List[pd.DataFrame],
filepath: str,
model_names: Optional[List[str]] = None,
) -> None:
"""
Write multiple models to a single PDB file with MODEL/ENDMDL records.
Each DataFrame is wrapped in a MODEL/ENDMDL pair, producing a
multi-model PDB file suitable for ensemble or time-resolved data.
Parameters
----------
dataframes : list of pandas.DataFrame
List of atom DataFrames (same format as ``write()`` expects).
filepath : str
Output PDB filename.
model_names : list of str, optional
Names for each model (written as REMARK before each MODEL record).
If None, models are numbered sequentially.
"""
if not dataframes:
return
with open(filepath, "w") as f:
# Write CRYST1 from first model if available
first_df = dataframes[0]
try:
cell = first_df.attrs["cell"]
spacegroup = first_df.attrs["spacegroup"]
cell_abc = cell[:3]
cell_angles = cell[3:]
z = first_df.attrs.get("z", "")
try:
strz = str(int(z))
except Exception:
strz = ""
line = (
"CRYST1"
+ "".join([f"{i:>9.3f}" for i in cell_abc])
+ "".join([f"{i:>7.2f}" for i in cell_angles])
+ " "
+ f"{spacegroup:<14}"
+ strz
+ "\n"
)
f.write(line)
except Exception:
pass
for model_idx, df in enumerate(dataframes):
model_num = model_idx + 1
if model_names and model_idx < len(model_names):
f.write(f"REMARK 3 MODEL {model_num}: {model_names[model_idx]}\n")
f.write(f"MODEL {model_num:>4}\n")
for i, row in df.iterrows():
ATOM = row.get("ATOM", "ATOM")
serial = row.get("serial", i + 1)
name = str(row.get("name", "CA"))
altloc = str(row.get("altloc", ""))
resname = str(row.get("resname", "UNK"))
chainid = str(row.get("chainid", ""))
resseq = int(row.get("resseq", 1))
icode = str(row.get("icode", ""))
x = float(row.get("x", 0.0))
y = float(row.get("y", 0.0))
z_coord = float(row.get("z", 0.0))
occupancy = float(row.get("occupancy", 1.0))
tempfactor = float(row.get("tempfactor", 20.0))
element = str(row.get("element", "C"))
charge = row.get("charge", 0)
if charge > 0:
charge_str = "+" + str(charge)
elif charge == 0:
charge_str = ""
else:
charge_str = str(charge)
if len(name) > 3:
name = name[-3:]
if len(name) < 3:
name = name + " " * (3 - len(name))
if chainid is None or chainid == "nan":
chainid = ""
try:
s = (
f"{str(ATOM):<6}{int(serial):>5}{name:>5}{altloc:>1}"
f"{resname:>3}{chainid:>2}{resseq:>4}{icode:>4}"
f"{round(x, 3):>8}{round(y, 3):>8}{round(z_coord, 3):>8}"
f"{round(occupancy, 3):>6.2f}{round(tempfactor, 2):>6}"
f"{element:>12}{charge_str:>2}\n"
)
f.write(s)
except Exception:
pass
f.write("ENDMDL\n")
f.write("END\n")
# Legacy aliases for backwards compatibility during transition
PDB = PDBReader
find_header_length_pdb_file = find_header_length
load_pdb_as_pd = load_as_dataframe