Source code for CoREMOF.get_mofid

"""Process your CIF to get mofid v1 and v2.
"""

from ase.io import read as ase_read
from ase.io import write as ase_write
from ase import neighborlist
import networkx as nx
from ase.build import sort
import ase
import os, glob, shutil
import numpy as np
import collections

from pymatgen.core.structure import Structure
from pymatgen.core.lattice import Lattice

from mofid.id_constructor import extract_fragments
from collections import Counter
from mofid.run_mofid import cif2mofid

from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
import selfies as sf

from openbabel import openbabel as ob


metals  = ['Li', 'Na', 'K', 'Rb', 'Cs', 'Fr', 'Be', 'Mg', 'Ca', 'Sr', 'Ba', 'Ra',
          'Al', 'Ga', 'Ge', 'In', 'Sn', 'Sb', 'Tl', 'Pb', 'Bi', 'Po',
          'Sc', 'Ti', 'V' , 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
          'Y' , 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
          'Hf', 'Ta', 'W' , 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
          'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'U', 'Tm', 'Yb', 'Lu',
          'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'
         ]


[docs]
def run_v1(structure):

    """Converting CIF to mofid-v1, see https://snurr-group.github.io/mofid/ for additional installation. Tip: Please check CMAKE, JAVA, etc. before installing.

    Args:
        structure (str): path to your CIF.

    Returns:
        String:
            -   mofid-v1.
    """

    mofid_v1 = cif2mofid(structure)
    return mofid_v1 # mofid_v1['mofid'], mofid_v1['smiles_nodes'], mofid_v1['smiles_linkers'], mofid_v1['topology'], mofid_v1['cat']



[docs]
def dict2str(dct):
    """Convert symbol-to-number dict to str.
    """
    return ''.join(symb + (str(n)) for symb, n in dct.items())



[docs]
def split_nodes_from_cif(structure, prefix='Output'):
    """Split nodes (CIF) to single XYZ.

    Args:
        structure (str): path to your CIF.
        prefix (str): the path to save processed XYZ.

    Returns:
        int:
            -   0: sucess; 1: fail.
    """
    try:
        atoms = ase_read(structure)
    except:
        print('Error with reading CIF in {}'.format(structure))
        return 1
    cutOff = neighborlist.natural_cutoffs(atoms)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True, skin=0.3)
    neighborList.update(atoms)
    G = nx.Graph()
    for k in range(len(atoms)):
        tup = (k, {"element":"{}".format(atoms.get_chemical_symbols()[k]), "pos": atoms.get_positions()[k]})
        G.add_nodes_from([tup])
    for k in range(len(atoms)):
        for i in neighborList.get_neighbors(k)[0]:
            G.add_edge(k, i)
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    form_dicts = []
    for index, g in enumerate(Gcc):
        g = list(g)
        fragment = atoms[g]
        fragment = sort(fragment)
        form_dict = fragment.symbols.formula.count()
        form_dicts.append(dict2str(form_dict))
    #nodes = [atoms[list(Gcc[0])]]
    #unique_formdicts = [form_dicts[0]]
    nodes = []
    unique_formdicts = []
    if len(form_dicts) > 1:
        for index, form_dict in enumerate(form_dicts):
            if form_dict not in unique_formdicts:
                nodes.append(atoms[list(Gcc[index])])
                unique_formdicts.append(form_dict)
    elif len(form_dicts) == 1:
        nodes.append(atoms[list(Gcc[0])])
        unique_formdicts.append(form_dicts[0])
    for index, _ in enumerate(nodes):
        ase_write('{}/node{}.xyz'.format(prefix, index), nodes[index]) # /AllNode/nodes.cif
    return 0



[docs]
def split_linkers_from_cif(structure, prefix='Output'):
    """Split linkers (CIF) to single XYZ.

    Args:
        structure (str): path to your CIF.
        prefix (str): the path to save processed XYZ.

    Returns:
        int:
            -   0: sucess; 1: fail.
    """
    try:
        atoms = ase_read(structure)
    except:
        print('Error with reading CIF in {}'.format(structure))
        return 1
    cutOff = neighborlist.natural_cutoffs(atoms)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True, skin=0.3)
    neighborList.update(atoms)
    G = nx.Graph()
    for k in range(len(atoms)):
        tup = (k, {"element":"{}".format(atoms.get_chemical_symbols()[k]), "pos": atoms.get_positions()[k]})
        G.add_nodes_from([tup])
    for k in range(len(atoms)):
        for i in neighborList.get_neighbors(k)[0]:
            G.add_edge(k, i)
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    form_dicts = []
    for index, g in enumerate(Gcc):
        g = list(g)
        fragment = atoms[g]
        fragment = sort(fragment)
        form_dict = fragment.symbols.formula.count()
        form_dicts.append(dict2str(form_dict))
    nodes = []
    unique_formdicts = []
    if len(form_dicts) > 1:
        for index, form_dict in enumerate(form_dicts):
            if form_dict not in unique_formdicts:
                nodes.append(atoms[list(Gcc[index])])
                unique_formdicts.append(form_dict)
    elif len(form_dicts) == 1:
        nodes.append(atoms[list(Gcc[0])])
        unique_formdicts.append(form_dicts[0])
    for index, _ in enumerate(nodes):
        ase_write('{}_MetalOxolinker{}.xyz'.format(prefix, index), nodes[index]) # /MetalOxo/linkers.cif
    return 0



[docs]
def get_node_linker_files(structure, prefix='Output'):
    """Split MOF to sbu + linker.

    Args:
        structure (str): path to your CIF.
        prefix (str): the path to save processed XYZ.

    Returns:
        files:
            -   structure, node, linker, ...
    """
    os.makedirs(prefix, exist_ok=True)
    extract_fragments(structure, prefix)



[docs]
def xyz2fomula(xyzpath):
    """from XYZ to chemical fomula based on A-Z.

    Args:
        xyzpath (str): path to your XYZ.

    Returns:
        str:
            -   fomula.
    """
    atoms = ase_read(xyzpath)
    symbols = atoms.get_chemical_symbols()
    counts = Counter(symbols)
    sorted_items = sorted(counts.items(), key=lambda x: x[0])
    # formula = ''.join(f"{el}{n if n > 1 else ''}" for el, n in sorted_items)
    formula = ''.join(f"{el}{n}" for el, n in sorted_items)
    return formula



[docs]
def convert_ase_pymat(ase_objects):
    """convert to ase to pymatgen atoms.

    Args:
        ase_objects (ase.Atoms): ase-type atoms.

    Returns:
        ase.Atoms:
            -   pymatgen-type atoms.
    """
    structure_lattice = Lattice(ase_objects.cell)
    structure_species = ase_objects.get_chemical_symbols()
    structure_positions = ase_objects.get_positions()
    return Structure(structure_lattice,structure_species,structure_positions)



[docs]
def remove_pbc_cuts(atoms):
    """Remove building block cuts due to periodic boundary conditions. After the
    removal, the atoms object is centered at the center of the unit cell.

    Args:
        atoms (ase.Atoms): aase.Atoms.

    Returns:
        ase.Atoms:
            -   The processed atoms object.
    """

    try:
        # setting cuttoff parameter
        scale = 1.4
        cutoffs = ase.neighborlist.natural_cutoffs(atoms)
        cutoffs = [scale * c for c in cutoffs]
        #making neighbor_list
        #graphs of single metal is not constructed because there is no neighbor.
        I, J, D = ase.neighborlist.neighbor_list("ijD",atoms,cutoff=cutoffs)
        nl = [[] for _ in atoms]
        for i, j, d in zip(I, J, D):
            nl[i].append((j, d))
        visited = [False for _ in atoms]
        q = collections.deque()
        # Center of the unit cell.
        abc_half = np.sum(atoms.get_cell(), axis=0) * 0.5
        positions = {}
        q.append((0, np.array([0.0, 0.0, 0.0])))
        while q:
            i, pos = q.pop()
            visited[i] = True
            positions[i] = pos
            for j, d in nl[i]:
                if not visited[j]:
                    q.append((j, pos + d))
                    visited[j] = True
        centroid = np.array([0.0, 0.0, 0.0])
        for v in positions.values():
            centroid += v
        centroid /= len(positions)
        syms = [None for _ in atoms]
        poss = [None for _ in atoms]
        for i in range(len(atoms)):
            syms[i] = atoms.symbols[i]
            poss[i] = positions[i] - centroid + abc_half
        atoms = ase.Atoms(
            symbols=syms, positions=poss, pbc=True, cell=atoms.get_cell()
        )
        # resize of cell
        cell_x = np.max(atoms.positions[:,0]) - np.min(atoms.positions[:,0])
        cell_y = np.max(atoms.positions[:,1]) - np.min(atoms.positions[:,1])
        cell_z = np.max(atoms.positions[:,2]) - np.min(atoms.positions[:,2])
        cell = max([cell_x,cell_y,cell_z])
        atoms.set_cell([cell+2,cell+2,cell+2, 90,90,90])
        center_mass = atoms.get_center_of_mass()
        cell_half  = atoms.cell.cellpar()[0:3]/2
        atoms.positions = atoms.positions - center_mass + cell_half    
        return atoms
    except:
        return atoms



[docs]
def run_v2(structure, nodes_dataset, refname):
    """run mofidv2 from CIF.

    Args:
        structure (str): path to your MOF.
        nodes_dataset (str): path to your node dataset (download from https://github.com/sxm13/CoREMOF_tools/tree/main/CoREMOF/data/mofid/nodes.zip).
        refname (str): file name of your structure or define by yourself.

    Returns:
        str:
            -   mofid-v2.
    """
    # get list of nodes
    try:
        shutil.rmtree("Output")
    except:
        pass
    # get list of nodes
    nodes_type = glob.glob(nodes_dataset+"/*xyz")
    nodes = []
    for node_file in nodes_type:
        nodes.append(node_file.split("/")[-1].split("_")[0])
    nodes = list(set(nodes))
    # get information of mofid-v1
    mofidv1 = run_v1(structure)
    linkers = mofidv1["smiles_linkers"]
    all_linkers = []
    for linker in linkers:
        all_linkers.append(sf.encoder(linker))
    topology = mofidv1["topology"]
    cat = mofidv1["cat"]
    # get_node_linker_files(cifpath)
    check = split_nodes_from_cif("Output/AllNode/nodes.cif", "Output")
    if check == 1:
        print("nan")
        return "nan"
    else:
        all_nodes_xyz = glob.glob("./Output/node*xyz")
        all_nodes_part = []
        for node_xyz in all_nodes_xyz:
            node_formula = xyz2fomula(node_xyz)
            if node_formula in nodes:
                matcher = StructureMatcher(ltol = 0.3,
                                           stol = 2,
                                           angle_tol = 5,
                                           primitive_cell = False,
                                           scale = False,
                                           comparator=ElementComparator()) 
                matched = False
                known_nodes = glob.glob(nodes_dataset+ "/" + node_formula + "*xyz")
                
                mof = remove_pbc_cuts(ase_read(node_xyz))
                a = convert_ase_pymat(mof)
                for i in range(len(known_nodes)):
                    b = convert_ase_pymat(remove_pbc_cuts(ase_read(known_nodes[i])))
                    if matcher.fit(a, b):
                        node_part = os.path.basename(known_nodes[i].replace(".xyz", ""))
                        all_nodes_part.append(node_part)
                        matched = True
                        print(node_formula, "the node can be found in nodes dataset")
                        break
                if not matched:
                    # raise RuntimeError(f"fail matched from nodes dataset, stop")
                    node_part = node_formula + "_Type-" + str(len(known_nodes) + 1)
                    all_nodes_part.append(node_part)
                    shutil.move(node_xyz, nodes_dataset + "/" + node_part + ".xyz")
                    print("new node found, has moved the nodes dataset")
            else:
                node_part = node_formula + "_Type-1"
                all_nodes_part.append(node_part)
                shutil.move(node_xyz, nodes_dataset + "/" + node_part + ".xyz")
                print("new node found, has moved the nodes dataset")

    linkers_part = ".".join(all_linkers)
    nodes_part = ".".join(f"[{node}]" for node in all_nodes_part)
    mofidv2 = nodes_part + "." + linkers_part + " " + "MOFid-v2." + topology + ".cat" + cat + ";" + refname
    try:
        shutil.rmtree("Output")
    except:
        pass
    return mofidv2



[docs]
def are_identical_smiles(smiles1, smiles2):
    obConversion = ob.OBConversion()
    obConversion.SetInFormat("smi")
    obConversion.SetOutFormat("can")  # Set output to canonical SMILES
    mol1 = ob.OBMol()
    mol2 = ob.OBMol()
    # Read the SMILES strings into molecules
    obConversion.ReadString(mol1, smiles1)
    obConversion.ReadString(mol2, smiles2)
    # Convert molecules to canonical SMILES
    can_smiles1 = obConversion.WriteString(mol1).strip()
    can_smiles2 = obConversion.WriteString(mol2).strip()
    # Compare canonical SMILES
    return can_smiles1 == can_smiles2