Source code for coflandscaper._internal.dft

"""Generate and parse CRYSTAL23 inputs/outputs for COF workflows.

This module provides helpers for converting CIF files to CRYSTAL `.d12`
inputs, extracting converged energies from CRYSTAL `.out` files, and
recovering optimized structures for downstream analysis.
"""

from __future__ import annotations

import re
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from ase.data import atomic_numbers

if TYPE_CHECKING:
    from ase.atoms import Atoms

HARTREE_TO_EV = 27.211386245988



[docs]
def guess_symbol(raw: str) -> str | None:
    """Guess a chemical symbol from a CIF-like raw token.

    Args:
        raw: Raw symbol or label text.

    Returns:
        Normalized element symbol if recognized, else `None`.
    """
    s = re.sub(r"[^A-Za-z]", "", raw)
    if not s:
        return None
    s = s[0].upper() + s[1:].lower()
    if s in atomic_numbers:
        return s
    if len(s) > 1 and s[:2] in atomic_numbers:
        return s[:2]
    if s[0] in atomic_numbers:
        return s[0]
    return None




[docs]
def parse_cell(text: str) -> dict[str, float]:
    """Parse unit-cell lengths and angles from CIF text.

    Args:
        text: Full CIF text.

    Returns:
        Dictionary with keys `a`, `b`, `c`, `alpha`, `beta`, and `gamma`.

    Raises:
        ValueError: If any required cell parameter is missing.
    """

    def grab(key: str) -> float:
        m = re.search(rf"{key}\s+([0-9.+\-Ee()]+)", text)
        if not m:
            raise ValueError(f"Missing cell parameter: {key}")
        return float(re.sub(r"\([^)]*\)", "", m.group(1)))

    return {
        "a": grab(r"_cell_length_a"),
        "b": grab(r"_cell_length_b"),
        "c": grab(r"_cell_length_c"),
        "alpha": grab(r"_cell_angle_alpha"),
        "beta": grab(r"_cell_angle_beta"),
        "gamma": grab(r"_cell_angle_gamma"),
    }




[docs]
def extract_atoms(lines: list[str]) -> list[tuple[int, float, float, float]]:
    """Extract fractional atom sites from a CIF atom loop block.

    Args:
        lines: CIF file lines.

    Returns:
        List of tuples `(Z, x, y, z)` with atomic number and fractional coords.

    Raises:
        ValueError: If no supported fractional-coordinate atom loop is found.
    """
    for i, line in enumerate(lines):
        if line.strip().lower().startswith("loop_"):
            j = i + 1
            headers: list[str] = []
            while j < len(lines) and lines[j].strip().startswith("_"):
                headers.append(lines[j].strip())
                j += 1
            if not any("fract_x" in h.lower() for h in headers):
                continue

            hdr = {h.split()[0].lower(): idx for idx, h in enumerate(headers)}
            xk = next(k for k in hdr if "_fract_x" in k)
            yk = next(k for k in hdr if "_fract_y" in k)
            zk = next(k for k in hdr if "_fract_z" in k)
            lbl = next(
                (k for k in hdr if "_type_symbol" in k or "_label" in k), None
            )

            atoms: list[tuple[int, float, float, float]] = []
            while j < len(lines):
                s = lines[j].strip()
                if not s or s.startswith(("loop_", "_")):
                    break
                parts = lines[j].split()
                if len(parts) >= len(headers):
                    lab = parts[hdr[lbl]] if lbl else "X"
                    sym = guess_symbol(lab) or "X"
                    Z = atomic_numbers.get(sym, -1)
                    x = float(re.sub(r"\([^)]*\)", "", parts[hdr[xk]]))
                    y = float(re.sub(r"\([^)]*\)", "", parts[hdr[yk]]))
                    z = float(re.sub(r"\([^)]*\)", "", parts[hdr[zk]]))
                    atoms.append((Z, x, y, z))
                j += 1
            return atoms
    raise ValueError("No atom site loop with fractional coordinates.")




[docs]
def find_last_occurrence(lines: list[str], keyword: str) -> int:
    """Return the index of the last occurrence of a keyword.

    Args:
        lines: Text lines from a CRYSTAL output.
        keyword: Substring to search for.

    Returns:
        Index of the last matching line, or -1 if not found.
    """
    for i in range(len(lines) - 1, -1, -1):
        if keyword in lines[i]:
            return i
    return -1




[docs]
def parse_atom_lines(
    lines: list[str], start_idx: int
) -> tuple[list[str], list[list[float]]]:
    """Parse atom symbols and XYZ coordinates from a line block.

    Args:
        lines: Text lines from a CRYSTAL output.
        start_idx: Index where the atom block begins.

    Returns:
        Tuple of (symbols, coordinates), where coordinates are raw XYZ values.
    """
    symbols: list[str] = []
    raw_coords: list[list[float]] = []

    for i in range(start_idx, len(lines)):
        parts = lines[i].split()
        if len(parts) < 4:
            if symbols:
                break
            continue

        try:
            sym = parts[-4]
            if sym.capitalize() not in atomic_numbers:
                if symbols:
                    break
                continue

            x, y, z = float(parts[-3]), float(parts[-2]), float(parts[-1])
            symbols.append(sym)
            raw_coords.append([x, y, z])
        except (ValueError, IndexError):
            if symbols:
                break
            continue

    return symbols, raw_coords



def _parse_primary_structure(lines: list[str]) -> Atoms | None:
    """Parse a structure using DIRECT LATTICE and PRIMITIVE CELL blocks.

    Args:
        lines: Text lines from a CRYSTAL output.

    Returns:
        ASE Atoms if parsing succeeds, otherwise None.
    """
    lat_idx = find_last_occurrence(lines, "DIRECT LATTICE")
    prim_idx = find_last_occurrence(lines, "PRIMITIVE CELL")

    if lat_idx == -1 and prim_idx == -1:
        return None

    try:
        cell = None
        if lat_idx != -1:
            v_start = lat_idx + 2
            cell = [
                list(map(float, lines[v_start].split())),
                list(map(float, lines[v_start + 1].split())),
                list(map(float, lines[v_start + 2].split())),
            ]

        symbols: list[str] = []
        positions: list[list[float]] = []
        if prim_idx != -1:
            symbols, positions = parse_atom_lines(lines, prim_idx + 4)

        if not symbols and cell is None:
            return None

        try:
            from ase import Atoms
        except Exception as exc:
            raise ModuleNotFoundError(
                "ase is required for CRYSTAL structure extraction. Install with: pip install ase"
            ) from exc

        return Atoms(
            symbols=symbols,
            cell=cell,
            positions=positions,
            pbc=(cell is not None),
        )
    except (ValueError, IndexError, KeyError):
        return None


def _parse_fallback_structure(lines: list[str]) -> Atoms | None:
    """Parse a structure using LATTICE PARAMETERS and ATOM blocks.

    Args:
        lines: Text lines from a CRYSTAL output.

    Returns:
        ASE Atoms if parsing succeeds, otherwise None.
    """
    lat_header_sig = "LATTICE PARAMETERS (ANGSTROMS AND DEGREES)"
    lat_idx = find_last_occurrence(lines, lat_header_sig)

    if lat_idx == -1:
        return None

    try:
        param_line_idx = -1
        for i in range(lat_idx, min(lat_idx + 10, len(lines))):
            if "ALPHA" in lines[i] and "BETA" in lines[i]:
                param_line_idx = i + 1
                break

        if param_line_idx == -1 or param_line_idx >= len(lines):
            return None

        params = list(map(float, lines[param_line_idx].split()))
        if len(params) != 6:
            return None

        try:
            from ase.geometry import cellpar_to_cell
        except Exception as exc:
            raise ModuleNotFoundError(
                "ase is required for CRYSTAL structure extraction. Install with: pip install ase"
            ) from exc

        cell = cellpar_to_cell(params)

        atom_header_idx = -1
        for i in range(param_line_idx, len(lines)):
            if "ATOM" in lines[i] and "Z" in lines[i]:
                atom_header_idx = i
                break
        if atom_header_idx == -1:
            return None

        header = lines[atom_header_idx]
        x_is_frac = "/A" in header or "/a" in header
        y_is_frac = "/B" in header or "/b" in header
        z_is_frac = "/C" in header or "/c" in header

        symbols, raw_coords = parse_atom_lines(lines, atom_header_idx + 2)
        if not symbols:
            return None

        final_positions: list[list[float]] = []
        cell_mat = np.array(cell, dtype=float)

        for x, y, z in raw_coords:
            if x_is_frac and y_is_frac and z_is_frac:
                pos = np.dot([x, y, z], cell_mat)
            elif (not x_is_frac) and (not y_is_frac) and (not z_is_frac):
                pos = [x, y, z]
            else:
                frac_vec = [
                    x if x_is_frac else 0.0,
                    y if y_is_frac else 0.0,
                    z if z_is_frac else 0.0,
                ]
                cart_vec = [
                    x if not x_is_frac else 0.0,
                    y if not y_is_frac else 0.0,
                    z if not z_is_frac else 0.0,
                ]
                part_a = np.dot(frac_vec, cell_mat)
                pos = [float(part_a[k] + cart_vec[k]) for k in range(3)]
            final_positions.append(
                [float(pos[0]), float(pos[1]), float(pos[2])]
            )

        try:
            from ase import Atoms
        except Exception as exc:
            raise ModuleNotFoundError(
                "ase is required for CRYSTAL structure extraction. Install with: pip install ase"
            ) from exc

        return Atoms(
            symbols=symbols, positions=final_positions, cell=cell, pbc=True
        )
    except Exception:
        return None



[docs]
def parse_z_L_from_stem(stem: str) -> tuple[float, float]:
    """Parse `_z` and `_L` tags from a structure stem.

    Args:
        stem: Structure filename stem.

    Returns:
        Tuple `(z, L)` in Angstrom, or `(nan, nan)` when tags are missing.
    """
    mz = re.search(r"_z(\d+)", stem)
    mL = re.search(r"_L(\d+)", stem)
    if not (mz and mL):
        return np.nan, np.nan
    z = float(mz.group(1)) / 10.0
    L = float(mL.group(1)) / 10.0
    return z, L




[docs]
class Crystal:
    """Base class for converting CIF structures into CRYSTAL `.d12` inputs.

    Returns:
        None.
    """

    def __init__(self, post_block: str) -> None:
        """Initialize a CRYSTAL input generator.

        Args:
            post_block: Text appended to each generated .d12 file.
                This is where BASISSET/DFT/SHRINK or OPTGEOM blocks are injected.
        """
        self._post_block = post_block

    def _convert_one(
        self, cif_path: Path, output_path: Path | None = None
    ) -> Path:
        """Convert one CIF file into one CRYSTAL `.d12` file.

        Args:
            cif_path: Input CIF path.
            output_path: Optional output `.d12` path. Defaults to `None`
                (writes next to input with `.d12` suffix).

        Returns:
            Path to the generated `.d12` file.
        """
        txt = cif_path.read_text(errors="ignore")
        lines = txt.splitlines()
        cell = parse_cell(txt)
        atoms = extract_atoms(lines)

        title = cif_path.stem
        out_path = output_path or cif_path.with_suffix(".d12")

        out: list[str] = []
        out.append(title)
        out.append("CRYSTAL")
        out.append("0 0 0")
        out.append("1")  # P1 symmetry
        out.append(
            f"{cell['a']:.6f} {cell['b']:.6f} {cell['c']:.6f} "
            f"{cell['alpha']:.6f} {cell['beta']:.6f} {cell['gamma']:.6f}"
        )
        out.append(str(len(atoms)))
        for Z, x, y, z in atoms:
            if Z < 0:
                out.append(f"0 {x:.9f} {y:.9f} {z:.9f}")
            else:
                out.append(f"{Z} {x:.9f} {y:.9f} {z:.9f}")
        if self._post_block.strip():
            out.append(self._post_block.strip())

        out_path.write_text("\n".join(out) + "\n")
        return out_path


[docs]
    def run(
        self,
        input_folder: str,
        output_folder: str | None = None,
    ) -> None:
        """Convert all CIF files in a folder to .d12.

        Args:
            input_folder: Folder containing .cif files.
            output_folder: Optional output folder for `.d12` files.
                Defaults to `None` (writes under `input_folder`).
        """
        in_path = Path(input_folder)
        if not in_path.exists():
            raise FileNotFoundError(f"Input folder not found: {in_path}")

        out_path = Path(output_folder) if output_folder else in_path
        out_path.mkdir(parents=True, exist_ok=True)

        for cif in sorted(in_path.glob("*.cif")):
            try:
                subdir = out_path / cif.stem
                subdir.mkdir(parents=True, exist_ok=True)
                target = subdir / (cif.stem + ".d12")
                self._convert_one(cif, output_path=target)
            except Exception:
                continue



[docs]
    def generate_input(
        self,
        cof_name: str,
        mode: str,
        input_base_folder: str | None = None,
        output_base_folder: str | None = None,
    ) -> None:
        """Convert CIFs for selected stacking modes into CRYSTAL .d12 inputs.

        Args:
            cof_name: COF name used for folder naming.
            mode: Mode selector. Allowed values are `"incl"`, `"serr"`,
                or `"both"`.
            input_base_folder: Optional base folder containing mode subfolders.
                Defaults to `None` (uses `{cof_name}/2_{cof_name}_matrix`).
            output_base_folder: Optional base folder for outputs.
                Defaults to `None` (uses `{cof_name}/2_{cof_name}_matrix`).

        Notes:
            Outputs are written to dft_{serr|incl} subfolders.
        """
        mode_lower = mode.lower()
        if mode_lower not in {"incl", "serr", "both"}:
            raise ValueError("mode must be 'incl', 'serr', or 'both'.")

        mode_tags = ["serr", "incl"] if mode_lower == "both" else [mode_lower]
        input_base_folder_used = (
            input_base_folder or f"{cof_name}/2_{cof_name}_matrix"
        )
        output_base_folder_used = (
            output_base_folder or f"{cof_name}/2_{cof_name}_matrix"
        )

        for mode_tag in mode_tags:
            self.run(
                input_folder=f"{input_base_folder_used}/{mode_tag}",
                output_folder=f"{output_base_folder_used}/dft_{mode_tag}",
            )





[docs]
class CrystalSP(Crystal):
    """CRYSTAL single-point input and output parser utilities.

    Returns:
        None.
    """

    def _extract_energy_au(self, text: str) -> float | None:
        """Extract final converged total energy from CRYSTAL output text.

        Args:
            text: Full `.out` text content.

        Returns:
            Converged energy in Hartree, or `None` when not found.
        """
        lines = text.splitlines()
        if len(lines) < 2:
            return None
        if "TELAPSE" not in lines[-2]:
            return None

        energy_label = "TOTAL ENERGY + DISP + GCP (AU)"
        num_re = re.compile(r"([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)")
        last_val: float | None = None

        for i, line in enumerate(lines):
            if energy_label in line:
                tail = line.split(energy_label, 1)[1]
                m = num_re.search(tail)
                if m:
                    last_val = float(m.group(1))
                elif i + 1 < len(lines):
                    m2 = num_re.search(lines[i + 1])
                    if m2:
                        last_val = float(m2.group(1))

        return last_val

    def __init__(
        self,
        basisset: str = "SOLDEF2MSVP",
        functional: str = "HSESOL3C",
        shrink: str = "2 2 8",
        post_block: str | None = None,
    ) -> None:
        """Initialize a CRYSTAL single-point input generator.

        Args:
            basisset: CRYSTAL basis set name. Defaults to `"SOLDEF2MSVP"`.
            functional: CRYSTAL functional name. Defaults to `"HSESOL3C"`.
            shrink: SHRINK line values. Defaults to `"2 2 8"`.
            post_block: Optional override for the full CRYSTAL input tail.
                Defaults to `None` (auto-generates BASISSET/DFT/SHRINK block).
        """
        if post_block is None:
            post_block = f"""BASISSET
{basisset}
DFT
{functional}
END
SHRINK
0 8
{shrink}
END"""
        super().__init__(post_block=post_block)


[docs]
    def read(
        self,
        input_folder: str,
        output_csv_dir: str | None = None,
        output_filename_suffix: str = "",
    ) -> Path:
        """Extract converged energies from CRYSTAL .out files.

        Args:
            input_folder: Folder containing CRYSTAL output files.
            output_csv_dir: Optional output folder for the CSV.
                Defaults to `None` (uses `{cof_name}/3_{cof_name}_landscape`).
            output_filename_suffix: Optional suffix appended to the default
                CSV filename stem (before `.csv`), e.g. `"_dft"`.
                Defaults to `""`.

        Returns:
            Path to the energies CSV.
        """
        input_path = Path(input_folder)
        folder_tag = input_path.name
        mode_tag = (
            folder_tag.replace("dft_", "")
            if folder_tag.startswith("dft_")
            else folder_tag
        )

        cof_name = folder_tag
        if input_path.parent.name.endswith("_matrix"):
            cof_name = input_path.parents[1].name
        elif input_path.parent.name:
            cof_name = input_path.parent.name

        out_files: list[Path] = []
        for out_path in sorted(input_path.rglob("*.out")):
            if out_path.name.lower().startswith("slurm"):
                continue
            if out_path.parent == input_path:
                out_files.append(out_path)
                continue
            if out_path.stem == out_path.parent.name:
                out_files.append(out_path)

        if not out_files:
            raise FileNotFoundError(
                f"No valid .out files found in: {input_path.resolve()} (expected system_name.out)"
            )

        csv_dir = Path(output_csv_dir or f"{cof_name}/3_{cof_name}_landscape")
        csv_dir.mkdir(parents=True, exist_ok=True)

        energies_csv_path = (
            csv_dir
            / f"{cof_name}_sp_energies_{mode_tag}{output_filename_suffix}.csv"
        )

        rows = []
        failed = []
        for out_path in out_files:
            try:
                text = out_path.read_text(errors="ignore")
                energy_au = self._extract_energy_au(text)
                if energy_au is None:
                    raise ValueError("Energy not found in output")
                energy_ev = energy_au * HARTREE_TO_EV
                z, L = parse_z_L_from_stem(out_path.stem)
                rows.append(
                    {
                        "structure": out_path.stem,
                        "z": z,
                        "L": L,
                        "energy_eV": energy_ev,
                    }
                )
            except Exception as exc:
                failed.append((str(out_path), repr(exc)))

        df = pd.DataFrame(rows).sort_values("structure").reset_index(drop=True)
        if not df.empty:
            min_e = float(df["energy_eV"].min())
            df["energy_rel_eV"] = df["energy_eV"] - min_e
        df.to_csv(energies_csv_path, index=False)

        if failed:
            print("\nFailed files (first 10):")
            for p, err in failed[:10]:
                print(" -", p)
                print("   ", err)
            print("Total failed:", len(failed))

        return energies_csv_path



[docs]
    def read_output(
        self,
        cof_name: str,
        mode: str,
        output_base_folder: str | None = None,
        input_base_folder: str | None = None,
    ) -> list[Path]:
        """Extract CRYSTAL single-point energies for selected mode(s).

        Args:
            cof_name: COF name used for folder naming.
            mode: Mode selector. Allowed values are `"incl"`, `"serr"`,
                or `"both"`.
            output_base_folder: Optional output folder for CSVs.
                Defaults to `None` (uses `{cof_name}/3_{cof_name}_landscape`).
                Default filenames include a `_dft` suffix.
            input_base_folder: Optional base folder containing dft_{mode} subfolders.
                Defaults to `None` (uses `{cof_name}/2_{cof_name}_matrix`).

        Returns:
            List of CSV paths written.
        """
        from .ild_ils_utils import get_mode_folders

        input_base_used = (
            input_base_folder or f"{cof_name}/2_{cof_name}_matrix"
        )
        csv_paths: list[Path] = []
        for folder in get_mode_folders(cof_name, mode):
            mode_tag = Path(folder).name
            csv_paths.append(
                self.read(
                    input_folder=f"{input_base_used}/dft_{mode_tag}",
                    output_csv_dir=output_base_folder,
                    output_filename_suffix="_dft",
                )
            )
        return csv_paths





[docs]
class CrystalOpt(Crystal):
    """CRYSTAL geometry-optimization input/output utilities.

    Returns:
        None.
    """

    def _extract_energy_au(self, text: str) -> float | None:
        """Extract final converged total energy from CRYSTAL output text.

        Args:
            text: Full `.out` text content.

        Returns:
            Converged energy in Hartree, or `None` when not found.
        """
        lines = text.splitlines()
        if len(lines) < 2:
            return None
        if "TELAPSE" not in lines[-2]:
            return None

        energy_label = "TOTAL ENERGY + DISP + GCP (AU)"
        num_re = re.compile(r"([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)")
        last_val: float | None = None

        for i, line in enumerate(lines):
            if energy_label in line:
                tail = line.split(energy_label, 1)[1]
                m = num_re.search(tail)
                if m:
                    last_val = float(m.group(1))
                elif i + 1 < len(lines):
                    m2 = num_re.search(lines[i + 1])
                    if m2:
                        last_val = float(m2.group(1))

        return last_val

    def __init__(
        self,
        basisset: str = "SOLDEF2MSVP",
        functional: str = "HSESOL3C",
        shrink: str = "2 2 8",
        maxtradius: str = "0.5",
        post_block: str | None = None,
    ) -> None:
        """Initialize a CRYSTAL geometry-optimization input generator.

        Args:
            basisset: CRYSTAL basis set name. Defaults to `"SOLDEF2MSVP"`.
            functional: CRYSTAL functional name. Defaults to `"HSESOL3C"`.
            shrink: SHRINK line values. Defaults to `"2 2 8"`.
            maxtradius: MAXTRADIUS value for OPTGEOM. Defaults to `"0.5"`.
            post_block: Optional override for the full CRYSTAL input tail.
                Defaults to `None` (auto-generates OPTGEOM and DFT blocks).
        """
        if post_block is None:
            post_block = f"""OPTGEOM
MAXTRADIUS
{maxtradius}
ENDOPT
BASISSET
{basisset}
DFT
{functional}
END
SHRINK
0 8
{shrink}
END"""
        super().__init__(post_block=post_block)


[docs]
    def generate_input(
        self,
        cof_name: str,
        mode: str,
        input_base_folder: str | None = None,
        output_base_folder: str | None = None,
    ) -> None:
        """Generate CRYSTAL geometry-optimization inputs for selected mode(s).

        Args:
            cof_name: COF name used for folder naming.
            mode: Mode selector. Allowed values are `"incl"`, `"serr"`,
                or `"both"`.
            input_base_folder: Optional base folder containing per-mode input subfolders.
                Defaults to `None`
                (uses `{cof_name}/3_{cof_name}_landscape/selection`).
            output_base_folder: Optional base folder for outputs (relative to cof_name).
                Outputs are written to dft_{mode} subfolders under this base.
                Defaults to `None` (uses `{cof_name}/4_{cof_name}_optimization`).
        """
        from .ild_ils_utils import get_mode_folders

        if input_base_folder is None:
            input_base_folder = f"{cof_name}/3_{cof_name}_landscape/selection"
        if output_base_folder is None:
            output_base_folder = f"{cof_name}/4_{cof_name}_optimization"

        for folder in get_mode_folders(cof_name, mode):
            mode_tag = Path(folder).name
            self.run(
                input_folder=f"{input_base_folder}/{mode_tag}",
                output_folder=f"{output_base_folder}/dft_{mode_tag}",
            )



[docs]
    def read(
        self,
        input_folder: str,
        output_csv_dir: str | None = None,
    ) -> Path:
        """Extract converged energies from CRYSTAL .out files.

        Args:
            input_folder: Folder containing CRYSTAL output files.
            output_csv_dir: Optional output folder for the CSV.
                Defaults to `None` (uses `{cof_name}/4_{cof_name}_optimization`).

        Returns:
            Path to the energies CSV.
        """
        input_path = Path(input_folder)
        folder_tag = input_path.name
        mode_tag = (
            folder_tag.replace("dft_", "")
            if folder_tag.startswith("dft_")
            else folder_tag
        )

        cof_name = folder_tag
        if input_path.parent.name.endswith(
            ("_final_structures", "_optimization")
        ):
            cof_name = input_path.parents[1].name
        elif input_path.parent.name:
            cof_name = input_path.parent.name

        out_files = self._collect_out_files([input_path])

        if not out_files:
            raise FileNotFoundError(
                f"No valid .out files found in: {input_path.resolve()} (expected system_name.out)"
            )

        csv_dir = Path(
            output_csv_dir or f"{cof_name}/4_{cof_name}_optimization"
        )
        csv_dir.mkdir(parents=True, exist_ok=True)

        energies_csv_path = (
            csv_dir / f"{cof_name}_opt_energies_per_layer_dft.csv"
        )

        rows: list[dict[str, str | float]] = []
        failed = []
        for out_path in out_files:
            try:
                text = out_path.read_text(errors="ignore")
                energy_au = self._extract_energy_au(text)
                if energy_au is None:
                    raise ValueError("Energy not found or not converged")
                energy_ev = energy_au * HARTREE_TO_EV
                energy_ev_per_layer = (
                    energy_ev / 2.0 if mode_tag == "serr" else energy_ev
                )
                rows.append(
                    {
                        "structure": out_path.stem,
                        "stacking_mode": mode_tag,
                        "energy_eV_per_layer": energy_ev_per_layer,
                    }
                )
            except Exception as exc:
                failed.append((str(out_path), repr(exc)))

        df = pd.DataFrame(rows).sort_values("structure").reset_index(drop=True)
        if not df.empty:
            min_e = float(df["energy_eV_per_layer"].min())
            df["energy_rel_eV_per_layer"] = df["energy_eV_per_layer"] - min_e
        df.to_csv(energies_csv_path, index=False)

        if failed:
            print("\nFailed files (first 10):")
            for p, err in failed[:10]:
                print(" -", p)
                print("   ", err)
            print("Total failed:", len(failed))

        return energies_csv_path


    def _collect_out_files(self, input_paths: list[Path]) -> list[Path]:
        """Collect valid CRYSTAL .out files from one or more folders.

        Args:
            input_paths: List of folders to search.

        Returns:
            Sorted list of valid .out files.
        """
        out_files: list[Path] = []
        for input_path in input_paths:
            for out_path in sorted(input_path.rglob("*.out")):
                if out_path.name.lower().startswith("slurm"):
                    continue
                if out_path.parent == input_path:
                    out_files.append(out_path)
                    continue
                if out_path.stem == out_path.parent.name:
                    out_files.append(out_path)
        return out_files


[docs]
    def read_output(
        self,
        cof_name: str,
        mode: str,
        output_base_folder: str | None = None,
        input_base_folder: str | None = None,
    ) -> list[Path]:
        """Extract optimization energies for selected mode(s) into one CSV.

        Args:
            cof_name: COF name used for folder naming.
            mode: Mode selector. Allowed values are `"incl"`, `"serr"`,
                or `"both"`.
            output_base_folder: Optional output folder for the CSVs.
                Defaults to `None` (uses `{cof_name}/4_{cof_name}_optimization`).
            input_base_folder: Optional base folder containing dft_{mode} subfolders.
                Defaults to `None` (uses `{cof_name}/4_{cof_name}_optimization`).

        Returns:
            List containing the combined CSV path.

        Notes:
            Output file name is {cof_name}_opt_energies_per_layer_dft.csv.
            Serrated energies are reported per layer (E/2).
        """
        from .ild_ils_utils import get_mode_folders

        if input_base_folder is None:
            input_base_folder = f"{cof_name}/4_{cof_name}_optimization"
        mode_tags = [
            Path(folder).name for folder in get_mode_folders(cof_name, mode)
        ]
        input_paths = [
            Path(f"{input_base_folder}/dft_{mode_tag}")
            for mode_tag in mode_tags
        ]
        out_files = self._collect_out_files(input_paths)
        if not out_files:
            raise FileNotFoundError(
                f"No valid .out files found in: {input_base_folder} (expected system_name.out)"
            )

        csv_dir = Path(
            output_base_folder or f"{cof_name}/4_{cof_name}_optimization"
        )
        csv_dir.mkdir(parents=True, exist_ok=True)
        energies_csv_path = (
            csv_dir / f"{cof_name}_opt_energies_per_layer_dft.csv"
        )

        rows: list[dict[str, str | float]] = []
        failed = []
        for out_path in out_files:
            try:
                text = out_path.read_text(errors="ignore")
                energy_au = self._extract_energy_au(text)
                if energy_au is None:
                    raise ValueError("Energy not found or not converged")
                energy_ev = energy_au * HARTREE_TO_EV
                mode_from_parent = out_path.parent.name
                if mode_from_parent.startswith("dft_"):
                    mode_from_parent = mode_from_parent.replace("dft_", "")
                if mode_from_parent not in {"serr", "incl"}:
                    mode_from_parent = (
                        "serr" if out_path.stem.endswith("_serr") else "incl"
                    )

                energy_ev_per_layer = (
                    energy_ev / 2.0
                    if mode_from_parent == "serr"
                    else energy_ev
                )
                rows.append(
                    {
                        "structure": out_path.stem,
                        "stacking_mode": mode_from_parent,
                        "energy_eV_per_layer": energy_ev_per_layer,
                    }
                )
            except Exception as exc:
                failed.append((str(out_path), repr(exc)))

        df_new = pd.DataFrame(rows)

        # Merge with existing CSV so rerunning a single mode updates only that
        # mode and preserves rows for other modes.
        if energies_csv_path.exists():
            try:
                df_existing = pd.read_csv(energies_csv_path)
            except Exception:
                df_existing = pd.DataFrame()
        else:
            df_existing = pd.DataFrame()

        if not df_existing.empty and "stacking_mode" in df_existing.columns:
            df_existing = df_existing[
                ~df_existing["stacking_mode"].isin(mode_tags)
            ]

        if df_existing.empty:
            df = df_new
        elif df_new.empty:
            df = df_existing
        else:
            df = pd.concat([df_existing, df_new], ignore_index=True)

        if not df.empty and {"stacking_mode", "structure"}.issubset(
            df.columns
        ):
            df = df.sort_values(["stacking_mode", "structure"]).reset_index(
                drop=True
            )
        if not df.empty:
            min_e = float(df["energy_eV_per_layer"].min())
            df["energy_rel_eV_per_layer"] = df["energy_eV_per_layer"] - min_e
        df.to_csv(energies_csv_path, index=False)

        if failed:
            print("\nFailed files (first 10):")
            for p, err in failed[:10]:
                print(" -", p)
                print("   ", err)
            print("Total failed:", len(failed))

        return [energies_csv_path]



[docs]
    def extract_structures(
        self,
        input_folder: str,
        output_folder: str | None = None,
    ) -> list[Path]:
        """Extract the final optimized structures to CIF files.

        Args:
            input_folder: Folder containing CRYSTAL output files.
            output_folder: Optional output folder for CIFs. Defaults to `None`
                (writes next to parsed output files).

        Returns:
            List of CIF paths written.
        """
        input_path = Path(input_folder)
        if not input_path.exists():
            raise FileNotFoundError(f"Input folder not found: {input_path}")

        out_files: list[Path] = []
        for out_path in sorted(input_path.rglob("*.out")):
            if out_path.name.lower().startswith("slurm"):
                continue
            if out_path.parent == input_path:
                out_files.append(out_path)
                continue
            if out_path.stem == out_path.parent.name:
                out_files.append(out_path)

        if not out_files:
            raise FileNotFoundError(
                f"No valid .out files found in: {input_path.resolve()} (expected system_name.out)"
            )

        try:
            from ase.io import write
        except Exception as exc:
            raise ModuleNotFoundError(
                "ase is required for CRYSTAL structure extraction. Install with: pip install ase"
            ) from exc

        outputs: list[Path] = []
        for out_path in out_files:
            text = out_path.read_text(errors="ignore")
            lines = text.splitlines()
            structure = _parse_primary_structure(lines)
            if structure is None:
                structure = _parse_fallback_structure(lines)
            if structure is None:
                raise RuntimeError(
                    f"Could not parse structure from: {out_path}"
                )

            out_dir = Path(output_folder) if output_folder else out_path.parent
            out_dir.mkdir(parents=True, exist_ok=True)

            out_cif = out_dir / f"{out_path.stem}.cif"
            write(str(out_cif), structure)
            outputs.append(out_cif)

        return outputs



[docs]
    def extract_cif(
        self,
        cof_name: str,
        mode: str,
        output_base_folder: str | None = None,
        input_base_folder: str | None = None,
    ) -> list[Path]:
        """Extract optimized CIFs from CRYSTAL outputs for selected mode(s).

        Args:
            cof_name: COF name used for folder naming.
            mode: Mode selector. Allowed values are `"incl"`, `"serr"`,
                or `"both"`.
            output_base_folder: Optional base folder for CIF outputs.
                Defaults to `None` (uses `{cof_name}/4_{cof_name}_optimization`).
                CIFs are written to `dft_{mode}` subfolders under this base.
            input_base_folder: Optional base folder containing dft_{mode} subfolders.
                Defaults to `None` (uses `{cof_name}/4_{cof_name}_optimization`).

        Notes:
            Defaults for both input and output bases are
            {cof_name}/4_{cof_name}_optimization.

        Returns:
            List of CIF paths written.
        """
        from .ild_ils_utils import get_mode_folders

        if input_base_folder is None:
            input_base_folder = f"{cof_name}/4_{cof_name}_optimization"
        if output_base_folder is None:
            output_base_folder = input_base_folder

        outputs: list[Path] = []
        for folder in get_mode_folders(cof_name, mode):
            mode_tag = Path(folder).name
            outputs.extend(
                self.extract_structures(
                    input_folder=f"{input_base_folder}/dft_{mode_tag}",
                    output_folder=f"{output_base_folder}/dft_{mode_tag}",
                )
            )
        return outputs