Source code for torchref.io.cif

"""
CIF/mmCIF file format reading and writing.

This module provides functions for reading and writing CIF (Crystallographic
Information File) format data, including:
- Reflection data (structure factors)
- Model data (atomic coordinates)
- Restraint dictionaries
- Electron density maps

Functions
---------
read_reflections
    Read reflection data from a CIF file.
read_model
    Read atomic coordinates from a CIF file.
read_restraints
    Read restraint dictionary from a CIF file.
list_data_blocks
    List available data blocks in a CIF file.
write_map
    Write electron density map to CCP4 format.
write_model
    Write atomic coordinates to mmCIF format.

Classes
-------
CIFReader
    Generic CIF file reader.
ReflectionCIFReader
    Reader for structure factor CIF files.
ModelCIFReader
    Reader for mmCIF coordinate files.
RestraintCIFReader
    Reader for restraint dictionary CIF files.

Examples
--------
::

    from torchref.io import cif

    # Reading reflections
    reader = cif.read_reflections('structure-sf.cif', verbose=1)
    data_dict, cell, spacegroup = reader()

    # Reading model
    reader = cif.read_model('structure.cif')
    df, cell, spacegroup = reader()

    # Writing model (mmCIF)
    from torchref.io.metadata import RefinementMetadata
    meta = RefinementMetadata(r_work=0.18, r_free=0.21)
    cif.write_model(df, 'refined.cif', metadata=meta)

    # Reading restraints
    reader = cif.read_restraints('ALA.cif')
    restraints = reader.get_all_restraints()

    # List data blocks
    blocks = cif.list_data_blocks('multi-block.cif')
"""

from typing import List, Optional

import numpy as np
import torch

# Import all CIF reader classes from the existing module
from torchref.io.cif_readers import (
    CIFReader,
    ModelCIFReader,
    ReflectionCIFReader,
    RestraintCIFReader,
)



[docs]
def read_reflections(
    filepath: str, data_block: Optional[str] = None, verbose: int = 0
) -> ReflectionCIFReader:
    """
    Read reflection data from a CIF file.

    Parameters
    ----------
    filepath : str
        Path to the structure factor CIF file.
    data_block : str, optional
        Name of the data block to read. If None, uses the first block.
    verbose : int, optional
        Verbosity level. Default is 0.

    Returns
    -------
    ReflectionCIFReader
        Reader object with data loaded.

    Examples
    --------
    ::

        reader = cif.read_reflections('structure-sf.cif')
        data_dict, cell, spacegroup = reader()
        hkl = data_dict['HKL']
    """
    return ReflectionCIFReader(filepath, verbose=verbose, data_block=data_block)




[docs]
def read_model(filepath: str, verbose: int = 0) -> ModelCIFReader:
    """
    Read atomic coordinates from a CIF file.

    Parameters
    ----------
    filepath : str
        Path to the mmCIF coordinate file.
    verbose : int, optional
        Verbosity level. Default is 0.

    Returns
    -------
    ModelCIFReader
        Reader object with data loaded.

    Examples
    --------
    ::

        reader = cif.read_model('structure.cif')
        df, cell, spacegroup = reader()
        print(f"Loaded {len(df)} atoms")
    """
    return ModelCIFReader(filepath, verbose=verbose)




[docs]
def read_restraints(filepath: str) -> RestraintCIFReader:
    """
    Read restraint dictionary from a CIF file.

    Parameters
    ----------
    filepath : str
        Path to the restraint dictionary CIF file.

    Returns
    -------
    RestraintCIFReader
        Reader object with restraints loaded.

    Examples
    --------
    ::

        reader = cif.read_restraints('ALA.cif')
        restraints = reader.get_all_restraints()
    """
    return RestraintCIFReader(filepath)




[docs]
def list_data_blocks(filepath: str) -> List[str]:
    """
    List available data blocks in a CIF file.

    Parameters
    ----------
    filepath : str
        Path to the CIF file.

    Returns
    -------
    list of str
        Names of available data blocks.

    Examples
    --------
    ::

        blocks = cif.list_data_blocks('multi-dataset.cif')
        print(f"Available blocks: {blocks}")
    """
    reader = CIFReader(filepath)
    return reader.available_blocks




[docs]
def write_map(data, cell, filepath: str, spacegroup: str = "P1") -> int:
    """
    Write a 3D numpy array or torch tensor to a CCP4 map file.

    Parameters
    ----------
    data : numpy.ndarray or torch.Tensor
        3D array of map data.
    cell : list, numpy.ndarray, or torch.Tensor
        Unit cell parameters [a, b, c, alpha, beta, gamma] in A and degrees.
    filepath : str
        Output CCP4 filename.
    spacegroup : str, optional
        Space group symbol. Default is 'P1'.

    Returns
    -------
    int
        Returns 1 on success.
    """
    import gemmi

    if isinstance(data, torch.Tensor):
        np_map = data.detach().cpu().numpy().astype(np.float32)
    else:
        np_map = data.astype(np.float32)

    if isinstance(cell, torch.Tensor):
        cell = cell.detach().cpu().numpy().tolist()
    elif isinstance(cell, np.ndarray):
        cell = cell.tolist()

    map_ccp = gemmi.Ccp4Map()
    map_ccp.grid = gemmi.FloatGrid(
        np_map, gemmi.UnitCell(*cell), gemmi.find_spacegroup_by_name(spacegroup)
    )
    map_ccp.setup(0.0)
    map_ccp.update_ccp4_header()
    map_ccp.write_ccp4_map(filepath)

    return 1




[docs]
def dataframe_to_gemmi_structure(df, cell, spacegroup):
    """Convert a torchref atom DataFrame to a gemmi.Structure.

    Parameters
    ----------
    df : pandas.DataFrame
        Atom DataFrame with standard torchref columns (ATOM, serial, name,
        altloc, resname, chainid, resseq, icode, x, y, z, occupancy,
        tempfactor, element, charge, anisou_flag, u11..u23).
    cell : list or numpy.ndarray
        Unit cell parameters [a, b, c, alpha, beta, gamma].
    spacegroup : str
        Space group name (Hermann-Mauguin notation).

    Returns
    -------
    gemmi.Structure
        The constructed gemmi Structure object.
    """
    import gemmi

    st = gemmi.Structure()
    st.name = "torchref"

    if cell is not None:
        if isinstance(cell, (list, tuple)):
            st.cell = gemmi.UnitCell(*cell)
        else:
            st.cell = gemmi.UnitCell(*cell.tolist())

    if spacegroup:
        st.spacegroup_hm = str(spacegroup)

    model = gemmi.Model("1")

    # Group by chain, then by (resseq, icode, resname) for residues
    for chain_id, chain_group in df.groupby("chainid", sort=False):
        chain = gemmi.Chain(str(chain_id) if chain_id and str(chain_id) != "nan" else "A")

        for (resseq, icode, resname), res_group in chain_group.groupby(
            ["resseq", "icode", "resname"], sort=False
        ):
            residue = gemmi.Residue()
            residue.name = str(resname).strip()
            seq_str = str(int(resseq))
            icode_str = str(icode).strip() if icode and str(icode) not in ("nan", " ") else ""
            residue.seqid = gemmi.SeqId(seq_str + icode_str)

            # Set het flag based on ATOM/HETATM
            first_atom_type = res_group.iloc[0]["ATOM"]
            if str(first_atom_type).strip() == "HETATM":
                residue.het_flag = "H"
            else:
                residue.het_flag = "A"

            for _, row in res_group.iterrows():
                atom = gemmi.Atom()
                atom.name = str(row["name"]).strip()

                elem_str = str(row["element"]).strip()
                if elem_str and elem_str != "nan":
                    atom.element = gemmi.Element(elem_str)

                atom.pos = gemmi.Position(
                    float(row["x"]), float(row["y"]), float(row["z"])
                )
                atom.occ = round(float(row["occupancy"]), 2)
                atom.b_iso = round(float(row["tempfactor"]), 2)

                altloc = str(row["altloc"]).strip()
                if altloc and altloc != "nan" and altloc != ".":
                    atom.altloc = altloc[0]

                charge = row.get("charge", 0)
                if charge and float(charge) != 0:
                    atom.charge = int(float(charge))

                # Anisotropic displacement parameters
                if row.get("anisou_flag", False):
                    u11 = float(row.get("u11", 0))
                    u22 = float(row.get("u22", 0))
                    u33 = float(row.get("u33", 0))
                    u12 = float(row.get("u12", 0))
                    u13 = float(row.get("u13", 0))
                    u23 = float(row.get("u23", 0))
                    atom.aniso = gemmi.SMat33f(u11, u22, u33, u12, u13, u23)

                residue.add_atom(atom)

            chain.add_residue(residue)

        model.add_chain(chain)

    st.add_model(model)

    # Populate PDBx label columns from structure topology
    st.setup_entities()
    st.assign_subchains()

    # Build entity sequences from residue names so label_seq_id can be assigned
    for entity in st.entities:
        if entity.entity_type == gemmi.EntityType.Polymer:
            for subchain_name in entity.subchains:
                subchain = st[0].get_subchain(subchain_name)
                entity.full_sequence = [res.name for res in subchain]
                break  # one subchain is enough

    st.assign_label_seq_id()

    return st



def _add_refine_categories(doc, metadata):
    """Inject refinement metadata into an mmCIF document.

    Parameters
    ----------
    doc : gemmi.cif.Document
        The mmCIF document to modify.
    metadata : RefinementMetadata
        Metadata to inject.
    """
    import gemmi

    block = doc.sole_block()
    cats = metadata.render_cif_categories()

    for cat_name, items in cats.items():
        # Check if any values are lists (loop categories)
        is_loop = any(isinstance(v, list) for v in items.values())

        if is_loop:
            # Create loop: gemmi expects prefix (e.g. '_audit_author.')
            # and tag suffixes (e.g. ['name', 'pdbx_ordinal'])
            tags = list(items.keys())
            prefix = tags[0].rsplit(".", 1)[0] + "."
            suffixes = [t.split(".")[-1] for t in tags]
            loop = block.init_loop(prefix, suffixes)
            # All list values should have same length
            n_rows = max(len(v) for v in items.values() if isinstance(v, list))
            for i in range(n_rows):
                row = []
                for tag in tags:
                    val = items[tag]
                    if isinstance(val, list):
                        row.append(str(val[i]) if i < len(val) else "?")
                    else:
                        row.append(str(val))
                loop.add_row(row)
        else:
            # Simple key-value pairs
            for key, val in items.items():
                block.set_pair(key, gemmi.cif.quote(str(val)))



[docs]
def write_model(df, filepath: str, metadata=None) -> None:
    """Write atomic coordinates to mmCIF file.

    Parameters
    ----------
    df : pandas.DataFrame
        Atom DataFrame with standard torchref columns.
    filepath : str
        Output mmCIF file path.
    metadata : RefinementMetadata, optional
        Metadata to include in the mmCIF file (refinement statistics,
        title, authors, etc.).
    """
    import gemmi

    cell = df.attrs.get("cell")
    spacegroup = df.attrs.get("spacegroup", "P 1")

    # Build metadata block first, then structure block, so metadata
    # appears before atom records in the output file.
    if metadata is not None:
        # Create a standalone document with metadata
        meta_doc = gemmi.cif.Document()
        meta_block = meta_doc.add_new_block("torchref")

        # Add cell and symmetry to metadata block
        if cell is not None:
            if not isinstance(cell, (list, tuple)):
                cell = cell.tolist()
            meta_block.set_pair("_cell.length_a", str(cell[0]))
            meta_block.set_pair("_cell.length_b", str(cell[1]))
            meta_block.set_pair("_cell.length_c", str(cell[2]))
            meta_block.set_pair("_cell.angle_alpha", str(cell[3]))
            meta_block.set_pair("_cell.angle_beta", str(cell[4]))
            meta_block.set_pair("_cell.angle_gamma", str(cell[5]))

        if spacegroup:
            meta_block.set_pair(
                "_symmetry.space_group_name_H-M", gemmi.cif.quote(str(spacegroup))
            )

        # Add refinement metadata categories
        _add_refine_categories(meta_doc, metadata)

        # Now build structure and get its atom_site loop
        st = dataframe_to_gemmi_structure(df, cell, spacegroup)
        struct_doc = st.make_mmcif_document()
        struct_block = struct_doc.sole_block()

        # Copy atom-related loops from struct_block to meta_block
        for item in struct_block:
            if item.loop is not None:
                loop = item.loop
                tags = list(loop.tags)
                suffixes = [t.split(".")[-1] for t in tags]
                prefix = tags[0].rsplit(".", 1)[0] + "."
                new_loop = meta_block.init_loop(prefix, suffixes)
                for row_idx in range(loop.length()):
                    row = [loop[row_idx, col] for col in range(loop.width())]
                    new_loop.add_row(row)
            elif item.pair is not None:
                tag, val = item.pair
                # Skip cell/symmetry - already added
                if not tag.startswith(("_cell.", "_symmetry.")):
                    meta_block.set_pair(tag, val)

        meta_doc.write_file(filepath)
    else:
        st = dataframe_to_gemmi_structure(df, cell, spacegroup)
        doc = st.make_mmcif_document()
        doc.write_file(filepath)



# Convenience aliases
read = read_reflections  # Default read is for reflections