Source code for torchref.io.cif

"""
CIF/mmCIF file format reading and writing.

This module provides functions for reading and writing CIF (Crystallographic
Information File) format data, including:
- Reflection data (structure factors)
- Model data (atomic coordinates)
- Restraint dictionaries
- Electron density maps

Functions
---------
read_reflections
    Read reflection data from a CIF file.
read_model
    Read atomic coordinates from a CIF file.
read_restraints
    Read restraint dictionary from a CIF file.
list_data_blocks
    List available data blocks in a CIF file.
write_map
    Write electron density map to CCP4 format.
write_model
    Write atomic coordinates to mmCIF format.

Classes
-------
CIFReader
    Generic CIF file reader.
ReflectionCIFReader
    Reader for structure factor CIF files.
ModelCIFReader
    Reader for mmCIF coordinate files.
RestraintCIFReader
    Reader for restraint dictionary CIF files.

Examples
--------
::

    from torchref.io import cif

    # Reading reflections
    reader = cif.read_reflections('structure-sf.cif', verbose=1)
    data_dict, cell, spacegroup = reader()

    # Reading model
    reader = cif.read_model('structure.cif')
    df, cell, spacegroup = reader()

    # Writing model (mmCIF)
    from torchref.io.metadata import RefinementMetadata
    meta = RefinementMetadata(r_work=0.18, r_free=0.21)
    cif.write_model(df, 'refined.cif', metadata=meta)

    # Reading restraints
    reader = cif.read_restraints('ALA.cif')
    restraints = reader.get_all_restraints()

    # List data blocks
    blocks = cif.list_data_blocks('multi-block.cif')
"""

from typing import List, Optional

import numpy as np
import torch

# Import all CIF reader classes from the existing module
from torchref.io.cif_readers import (
    CIFReader,
    ModelCIFReader,
    ReflectionCIFReader,
    RestraintCIFReader,
)


[docs] def read_reflections( filepath: str, data_block: Optional[str] = None, verbose: int = 0 ) -> ReflectionCIFReader: """ Read reflection data from a CIF file. Parameters ---------- filepath : str Path to the structure factor CIF file. data_block : str, optional Name of the data block to read. If None, uses the first block. verbose : int, optional Verbosity level. Default is 0. Returns ------- ReflectionCIFReader Reader object with data loaded. Examples -------- :: reader = cif.read_reflections('structure-sf.cif') data_dict, cell, spacegroup = reader() hkl = data_dict['HKL'] """ return ReflectionCIFReader(filepath, verbose=verbose, data_block=data_block)
[docs] def read_model(filepath: str, verbose: int = 0) -> ModelCIFReader: """ Read atomic coordinates from a CIF file. Parameters ---------- filepath : str Path to the mmCIF coordinate file. verbose : int, optional Verbosity level. Default is 0. Returns ------- ModelCIFReader Reader object with data loaded. Examples -------- :: reader = cif.read_model('structure.cif') df, cell, spacegroup = reader() print(f"Loaded {len(df)} atoms") """ return ModelCIFReader(filepath, verbose=verbose)
[docs] def read_restraints(filepath: str) -> RestraintCIFReader: """ Read restraint dictionary from a CIF file. Parameters ---------- filepath : str Path to the restraint dictionary CIF file. Returns ------- RestraintCIFReader Reader object with restraints loaded. Examples -------- :: reader = cif.read_restraints('ALA.cif') restraints = reader.get_all_restraints() """ return RestraintCIFReader(filepath)
[docs] def list_data_blocks(filepath: str) -> List[str]: """ List available data blocks in a CIF file. Parameters ---------- filepath : str Path to the CIF file. Returns ------- list of str Names of available data blocks. Examples -------- :: blocks = cif.list_data_blocks('multi-dataset.cif') print(f"Available blocks: {blocks}") """ reader = CIFReader(filepath) return reader.available_blocks
[docs] def write_map(data, cell, filepath: str, spacegroup: str = "P1") -> int: """ Write a 3D numpy array or torch tensor to a CCP4 map file. Parameters ---------- data : numpy.ndarray or torch.Tensor 3D array of map data. cell : list, numpy.ndarray, or torch.Tensor Unit cell parameters [a, b, c, alpha, beta, gamma] in A and degrees. filepath : str Output CCP4 filename. spacegroup : str, optional Space group symbol. Default is 'P1'. Returns ------- int Returns 1 on success. """ import gemmi if isinstance(data, torch.Tensor): np_map = data.detach().cpu().numpy().astype(np.float32) else: np_map = data.astype(np.float32) if isinstance(cell, torch.Tensor): cell = cell.detach().cpu().numpy().tolist() elif isinstance(cell, np.ndarray): cell = cell.tolist() map_ccp = gemmi.Ccp4Map() map_ccp.grid = gemmi.FloatGrid( np_map, gemmi.UnitCell(*cell), gemmi.find_spacegroup_by_name(spacegroup) ) map_ccp.setup(0.0) map_ccp.update_ccp4_header() map_ccp.write_ccp4_map(filepath) return 1
[docs] def dataframe_to_gemmi_structure(df, cell, spacegroup): """Convert a torchref atom DataFrame to a gemmi.Structure. Parameters ---------- df : pandas.DataFrame Atom DataFrame with standard torchref columns (ATOM, serial, name, altloc, resname, chainid, resseq, icode, x, y, z, occupancy, tempfactor, element, charge, anisou_flag, u11..u23). cell : list or numpy.ndarray Unit cell parameters [a, b, c, alpha, beta, gamma]. spacegroup : str Space group name (Hermann-Mauguin notation). Returns ------- gemmi.Structure The constructed gemmi Structure object. """ import gemmi st = gemmi.Structure() st.name = "torchref" if cell is not None: if isinstance(cell, (list, tuple)): st.cell = gemmi.UnitCell(*cell) else: st.cell = gemmi.UnitCell(*cell.tolist()) if spacegroup: st.spacegroup_hm = str(spacegroup) model = gemmi.Model("1") # Group by chain, then by (resseq, icode, resname) for residues for chain_id, chain_group in df.groupby("chainid", sort=False): chain = gemmi.Chain(str(chain_id) if chain_id and str(chain_id) != "nan" else "A") for (resseq, icode, resname), res_group in chain_group.groupby( ["resseq", "icode", "resname"], sort=False ): residue = gemmi.Residue() residue.name = str(resname).strip() seq_str = str(int(resseq)) icode_str = str(icode).strip() if icode and str(icode) not in ("nan", " ") else "" residue.seqid = gemmi.SeqId(seq_str + icode_str) # Set het flag based on ATOM/HETATM first_atom_type = res_group.iloc[0]["ATOM"] if str(first_atom_type).strip() == "HETATM": residue.het_flag = "H" else: residue.het_flag = "A" for _, row in res_group.iterrows(): atom = gemmi.Atom() atom.name = str(row["name"]).strip() elem_str = str(row["element"]).strip() if elem_str and elem_str != "nan": atom.element = gemmi.Element(elem_str) atom.pos = gemmi.Position( float(row["x"]), float(row["y"]), float(row["z"]) ) atom.occ = round(float(row["occupancy"]), 2) atom.b_iso = round(float(row["tempfactor"]), 2) altloc = str(row["altloc"]).strip() if altloc and altloc != "nan" and altloc != ".": atom.altloc = altloc[0] charge = row.get("charge", 0) if charge and float(charge) != 0: atom.charge = int(float(charge)) # Anisotropic displacement parameters if row.get("anisou_flag", False): u11 = float(row.get("u11", 0)) u22 = float(row.get("u22", 0)) u33 = float(row.get("u33", 0)) u12 = float(row.get("u12", 0)) u13 = float(row.get("u13", 0)) u23 = float(row.get("u23", 0)) atom.aniso = gemmi.SMat33f(u11, u22, u33, u12, u13, u23) residue.add_atom(atom) chain.add_residue(residue) model.add_chain(chain) st.add_model(model) # Populate PDBx label columns from structure topology st.setup_entities() st.assign_subchains() # Build entity sequences from residue names so label_seq_id can be assigned for entity in st.entities: if entity.entity_type == gemmi.EntityType.Polymer: for subchain_name in entity.subchains: subchain = st[0].get_subchain(subchain_name) entity.full_sequence = [res.name for res in subchain] break # one subchain is enough st.assign_label_seq_id() return st
def _add_refine_categories(doc, metadata): """Inject refinement metadata into an mmCIF document. Parameters ---------- doc : gemmi.cif.Document The mmCIF document to modify. metadata : RefinementMetadata Metadata to inject. """ import gemmi block = doc.sole_block() cats = metadata.render_cif_categories() for cat_name, items in cats.items(): # Check if any values are lists (loop categories) is_loop = any(isinstance(v, list) for v in items.values()) if is_loop: # Create loop: gemmi expects prefix (e.g. '_audit_author.') # and tag suffixes (e.g. ['name', 'pdbx_ordinal']) tags = list(items.keys()) prefix = tags[0].rsplit(".", 1)[0] + "." suffixes = [t.split(".")[-1] for t in tags] loop = block.init_loop(prefix, suffixes) # All list values should have same length n_rows = max(len(v) for v in items.values() if isinstance(v, list)) for i in range(n_rows): row = [] for tag in tags: val = items[tag] if isinstance(val, list): row.append(str(val[i]) if i < len(val) else "?") else: row.append(str(val)) loop.add_row(row) else: # Simple key-value pairs for key, val in items.items(): block.set_pair(key, gemmi.cif.quote(str(val)))
[docs] def write_model(df, filepath: str, metadata=None) -> None: """Write atomic coordinates to mmCIF file. Parameters ---------- df : pandas.DataFrame Atom DataFrame with standard torchref columns. filepath : str Output mmCIF file path. metadata : RefinementMetadata, optional Metadata to include in the mmCIF file (refinement statistics, title, authors, etc.). """ import gemmi cell = df.attrs.get("cell") spacegroup = df.attrs.get("spacegroup", "P 1") # Build metadata block first, then structure block, so metadata # appears before atom records in the output file. if metadata is not None: # Create a standalone document with metadata meta_doc = gemmi.cif.Document() meta_block = meta_doc.add_new_block("torchref") # Add cell and symmetry to metadata block if cell is not None: if not isinstance(cell, (list, tuple)): cell = cell.tolist() meta_block.set_pair("_cell.length_a", str(cell[0])) meta_block.set_pair("_cell.length_b", str(cell[1])) meta_block.set_pair("_cell.length_c", str(cell[2])) meta_block.set_pair("_cell.angle_alpha", str(cell[3])) meta_block.set_pair("_cell.angle_beta", str(cell[4])) meta_block.set_pair("_cell.angle_gamma", str(cell[5])) if spacegroup: meta_block.set_pair( "_symmetry.space_group_name_H-M", gemmi.cif.quote(str(spacegroup)) ) # Add refinement metadata categories _add_refine_categories(meta_doc, metadata) # Now build structure and get its atom_site loop st = dataframe_to_gemmi_structure(df, cell, spacegroup) struct_doc = st.make_mmcif_document() struct_block = struct_doc.sole_block() # Copy atom-related loops from struct_block to meta_block for item in struct_block: if item.loop is not None: loop = item.loop tags = list(loop.tags) suffixes = [t.split(".")[-1] for t in tags] prefix = tags[0].rsplit(".", 1)[0] + "." new_loop = meta_block.init_loop(prefix, suffixes) for row_idx in range(loop.length()): row = [loop[row_idx, col] for col in range(loop.width())] new_loop.add_row(row) elif item.pair is not None: tag, val = item.pair # Skip cell/symmetry - already added if not tag.startswith(("_cell.", "_symmetry.")): meta_block.set_pair(tag, val) meta_doc.write_file(filepath) else: st = dataframe_to_gemmi_structure(df, cell, spacegroup) doc = st.make_mmcif_document() doc.write_file(filepath)
# Convenience aliases read = read_reflections # Default read is for reflections