# Source code for CoREMOF.models.cp_app.featurizer

# -*- coding: utf-8 -*-

"""The methods to featurize a dataset of porous materials for heat capacity."""

import string
from pathlib import Path
import numpy as np
import pandas as pd
from typing import List
from pymatgen.io.cif import CifParser
from matminer.featurizers.site import GaussianSymmFunc, SiteElementalProperty,AGNIFingerprints 
from matminer.featurizers.base import BaseFeaturizer
from matminer.utils.data import MagpieData
from pymatgen.analysis.local_env import VoronoiNN


def featurize_dataset(cifs: list, verbos=False, saveto: str = "features.csv") -> pd.DataFrame:
    """Featurize crystal structures using elemental, geometric, and chemical descriptors for local environments.

    Every atomic site of every structure becomes one row of the result,
    indexed as ``"<cif file name>_<site index>"``. Each row holds the
    ``structure_name`` and ``structure_path`` of its parent file followed by
    the columns of four site featurizers, in this order: site elemental
    properties, AGNI fingerprints, Gaussian symmetry functions and local
    property statistics.

    Args:
        cifs: list of paths to crystal structures in cif format.
        verbos: if True, print the name of each structure as each featurizer
            processes it. The featurizer step labels are always printed.
        saveto: filename to save the generated features (CSV). A falsy value
            (e.g. ``""`` or ``None``) skips writing the file.

    Returns:
        DataFrame with one row per site and one column per feature.
    """
    property_list = ("Number", "AtomicWeight", "Row", "Column", "Electronegativity", "CovalentRadius")

    # Parse every file first, keyed by file name.
    # NOTE(review): two cifs with the same file name in different directories
    # share a key, so the later one silently replaces the earlier one.
    structures = {}
    for cif in cifs:
        cif_path = Path(cif)
        structures[cif_path.name] = (
            CifParser(cif).get_structures()[0],
            str(cif_path.parent),
        )

    ## 1. initialize the dictionary for each site
    features_dict = {}
    for structure_name, (structure, structure_path) in structures.items():
        for atomidx in range(structure.num_sites):
            features_dict["%s_%i" % (structure_name, atomidx)] = {
                "structure_name": structure_name,
                "structure_path": structure_path,
            }

    ## 2.-5. run each site featurizer over every site of every structure.
    # The tuple order fixes the column order of the resulting table.
    steps = (
        ("site elemental properties", SiteElementalProperty(properties=property_list)),
        ("AGNI", AGNIFingerprints(cutoff=5, directions=[None])),
        ("GSF", GaussianSymmFunc(cutoff=5)),
        ("LPD", LocalPropertyStatsNew(properties=property_list)),
    )
    for label, featurizer in steps:
        print(label)
        colnames = featurizer._generate_column_labels(multiindex=False, return_errors=False)
        for structure_name, (structure, _) in structures.items():
            if verbos:
                print(structure_name)
            for atomidx in range(structure.num_sites):
                feat = featurizer.featurize(structure, idx=atomidx)
                site_name = "%s_%i" % (structure_name, atomidx)
                features_dict[site_name].update(zip(colnames, feat))

    # Sites are the outer keys, so transpose to get one row per site.
    df_features = pd.DataFrame.from_dict(features_dict).T
    if saveto:
        df_features.to_csv(saveto)
    return df_features
def featurize_structure(cif: str, verbos=False, saveto: str = "features.csv") -> pd.DataFrame:
    """Featurize a crystal structure using elemental, geometric, and chemical descriptors for local environments.

    Every atomic site of the structure becomes one row of the result,
    indexed as ``"<cif file name>_<site index>"``. Each row holds the
    ``structure_name`` and ``structure_path`` of the file followed by the
    columns of four site featurizers, in this order: site elemental
    properties, AGNI fingerprints, Gaussian symmetry functions and local
    property statistics.

    Args:
        cif: path to the crystal structure in cif format.
        verbos: if True, print the structure name once per featurizer pass.
        saveto: filename to save the generated features (CSV). A falsy value
            (e.g. ``""`` or ``None``) skips writing the file.

    Returns:
        DataFrame with one row per site and one column per feature.
    """
    property_list = ("Number", "AtomicWeight", "Row", "Column", "Electronegativity", "CovalentRadius")

    structure = CifParser(cif).get_structures()[0]
    cif_path = Path(cif)
    structure_name = cif_path.name
    structure_path = str(cif_path.parent)

    ## 1. initialize the dictionary for each site
    site_names = ["%s_%i" % (structure_name, atomidx) for atomidx in range(structure.num_sites)]
    features_dict = {
        site_name: {"structure_name": structure_name, "structure_path": structure_path}
        for site_name in site_names
    }

    ## 2.-5. run each site featurizer over every site.
    # The tuple order fixes the column order of the resulting table.
    featurizers = (
        SiteElementalProperty(properties=property_list),
        AGNIFingerprints(cutoff=5, directions=[None]),
        GaussianSymmFunc(cutoff=5),
        LocalPropertyStatsNew(properties=property_list),
    )
    for featurizer in featurizers:
        colnames = featurizer._generate_column_labels(multiindex=False, return_errors=False)
        if verbos:
            print(structure_name)
        for atomidx, site_name in enumerate(site_names):
            feat = featurizer.featurize(structure, idx=atomidx)
            features_dict[site_name].update(zip(colnames, feat))

    # Sites are the outer keys, so transpose to get one row per site.
    df_features = pd.DataFrame.from_dict(features_dict).T
    if saveto:
        df_features.to_csv(saveto)
    return df_features
class LocalPropertyStatsNew(BaseFeaturizer):
    r"""
    Differences, minima and maxima in elemental properties between site and its neighboring sites.

    Uses the Voronoi tessellation of the structure to determine the
    neighbors of the site, and assigns each neighbor (:math:`n`) a weight
    (:math:`A_n`) that corresponds to the area of the facet on the
    tessellation corresponding to that neighbor.
    The local property difference is then computed by
    :math:`\frac{\sum_n {A_n |p_n - p_0|}}{\sum_n {A_n}}`
    where :math:`p_n` is the property (e.g., atomic number) of a neighbor
    and :math:`p_0` is the property of a site. In addition to this absolute
    difference, the weighted signed difference and the (unweighted) maximum
    and minimum of :math:`p_n - p_0` over the neighbors are returned.

    Taken from oximachine featurizer: https://github.com/kjappelbaum/oximachine_featurizer

    Features:
        - "local difference in [property]"
        - "local signed difference in [property]"
        - "maximum local difference in [property]"
        - "minimum local difference in [property]"

    References:
         - `Ward et al. _PRB_ 2017 <http://link.aps.org/doi/10.1103/PhysRevB.96.024104>`_
    """

    def __init__(
        self,
        data_source=MagpieData(),
        weight: str = "area",
        properties: List[str] = ("Electronegativity",),
        cutoff: float = 5,
    ):
        """Initialize the featurizer.

        Args:
            data_source (AbstractData): Class from which to retrieve
                elemental properties.
            weight (str): What aspect of each voronoi facet to use to
                weigh each neighbor (see VoronoiNN).
            properties (List[str]): List of properties to use.
            cutoff (float): cut off value passed to VoronoiNN.
        """
        self.data_source = data_source
        self.properties = properties
        self.weight = weight
        self.cutoff = cutoff

    @staticmethod
    def from_preset(preset: str, cutoff: float = 13):
        """
        Create a new LocalPropertyStatsNew class according to a preset.

        Args:
            preset (str): Name of preset; only "interpretable" is recognized.
            cutoff (float): Cutoff for the nearest neighbor search

        Returns:
            LocalPropertyStatsNew configured with the preset's property list.

        Raises:
            ValueError: if the preset name is not recognized.
        """
        if preset == "interpretable":
            return LocalPropertyStatsNew(
                data_source=MagpieData(),
                properties=[
                    "MendeleevNumber",
                    "Column",
                    "Row",
                    "Electronegativity",
                    "NsValence",
                    "NpValence",
                    "NdValence",
                    "NfValence",
                    "NValence",
                    "NsUnfilled",
                    "NpUnfilled",
                    "NdUnfilled",
                    "NfUnfilled",
                    "NUnfilled",
                    "GSbandgap",
                ],
                cutoff=cutoff,
            )
        else:
            raise ValueError("Unrecognized preset: " + preset)

    def featurize(self, strc, idx: int):
        """Compute the local property statistics of one site.

        Args:
            strc: structure containing the site; it is indexed with ``idx``
                and handed to ``VoronoiNN.get_nn_info``
                (presumably a pymatgen Structure - confirm against callers).
            idx (int): index of the target site within ``strc``.

        Returns:
            1D numpy array of length ``4 * len(self.properties)``: all
            absolute differences, then all signed differences, then all
            maxima, then all minima - the same order as ``feature_labels``.
        """
        # Get the targeted site
        my_site = strc[idx]

        # Get the tessellation of a site
        nn = VoronoiNN(
            weight=self.weight,
            tol=0.0,
            cutoff=self.cutoff,
            compute_adj_neighbors=False,
        ).get_nn_info(strc, idx)

        # Get the element and weight of each site
        elems = [n["site"].specie for n in nn]
        weights = [n["weight"] for n in nn]

        # Compute the difference for each property
        n_properties = len(self.properties)
        output = np.zeros((n_properties,))
        output_signed = np.zeros((n_properties,))
        output_max = np.zeros((n_properties,))
        output_min = np.zeros((n_properties,))

        # NOTE(review): with an empty neighbor list total_weight is 0 and
        # np.max / np.min below are called on an empty array - confirm
        # whether callers can hit this case.
        total_weight = np.sum(weights)
        for i, p in enumerate(self.properties):
            my_prop = self.data_source.get_elemental_property(my_site.specie, p)
            n_props = self.data_source.get_elemental_properties(elems, p)
            # Signed neighbor-minus-site differences, shared by all four stats.
            diffs = np.subtract(n_props, my_prop)
            output[i] = np.dot(weights, np.abs(diffs)) / total_weight
            output_signed[i] = np.dot(weights, diffs) / total_weight
            # Max and min are taken over the raw differences, not weighted.
            output_max[i] = np.max(diffs)
            output_min[i] = np.min(diffs)
        return np.hstack([output, output_signed, output_max, output_min])

    def feature_labels(self):
        """Return the feature names, ordered like the output of ``featurize``."""
        return (
            ["local difference in " + p for p in self.properties]
            + ["local signed difference in " + p for p in self.properties]
            + ["maximum local difference in " + p for p in self.properties]
            + ["minimum local difference in " + p for p in self.properties]
        )

    def citations(self):
        """Return the BibTeX entries of the works this featurizer is based on."""
        return [
            "@article{Ward2017,"
            "author = {Ward, Logan and Liu, Ruoqian "
            "and Krishna, Amar and Hegde, Vinay I. "
            "and Agrawal, Ankit and Choudhary, Alok "
            "and Wolverton, Chris},"
            "doi = {10.1103/PhysRevB.96.024104},"
            "journal = {Physical Review B},"
            "pages = {024104},"
            "title = {{Including crystal structure attributes "
            "in machine learning models of formation energies "
            "via Voronoi tessellations}},"
            # URL corrected to agree with the DOI of this same entry.
            "url = {http://link.aps.org/doi/10.1103/PhysRevB.96.024104},"
            "volume = {96},year = {2017}}",
            "@article{jong_chen_notestine_persson_ceder_jain_asta_gamst_2016,"
            "title={A Statistical Learning Framework for Materials Science: "
            "Application to Elastic Moduli of k-nary Inorganic Polycrystalline Compounds}, "
            "volume={6}, DOI={10.1038/srep34256}, number={1}, journal={Scientific Reports}, "
            "author={Jong, Maarten De and Chen, Wei and Notestine, Randy and Persson, "
            "Kristin and Ceder, Gerbrand and Jain, Anubhav and Asta, Mark and Gamst, Anthony}, "
            "year={2016}, month={Mar}}",
        ]

    def implementors(self):
        """Return the names of the people who implemented this featurizer."""
        return ["Logan Ward", "Aik Rui Tan"]
def add_type_feature(mydict, atomtype, name, feat):
    """Store ``feat`` under ``mydict[atomtype][name]`` and return ``mydict``.

    The inner dictionary for ``atomtype`` is created on first use; an
    existing value for ``name`` is overwritten. ``mydict`` is modified in
    place and the same object is returned.

    Args:
        mydict: nested dictionary mapping atom type -> {feature name: value}.
        atomtype: outer key.
        name: inner key.
        feat: value to store.
    """
    per_type = mydict.setdefault(atomtype, {})
    per_type[name] = feat
    return mydict