Source code for CoREMOF.models.cp_app.predictions

import numpy as np
import pandas as pd
from .descriptors import cv_features
import joblib
import glob
import copy

# Module-level alias for the feature list imported from .descriptors.
# NOTE: both functions below take a parameter that is also named FEATURES,
# which shadows this constant inside their bodies.
FEATURES = cv_features

def predict_Cv_ensemble_structure(models: list, FEATURES: list, df_features: pd.DataFrame, temperature: float) -> list:
    """Predict heat capacity using an ensemble of ML models for one structure.

    Every model predicts one value per row (site) of ``df_features``. For
    each model the per-site predictions are reduced to two numbers:

    * molar: sum of the site predictions divided by the number of sites;
    * gravimetric: sum of the site predictions divided by the sum of the
      ``"site AtomicWeight"`` column.

    The mean and standard deviation (``np.std``, i.e. ``ddof=0``) of those
    numbers over the ensemble are reported.

    Args:
        models: list (ensemble) of ML models; each must expose
            ``predict(features)``.
        FEATURES: names of the columns of ``df_features`` fed to the models.
        df_features: pandas dataframe containing the features. It must also
            contain the ``"structure_name"`` and ``"site AtomicWeight"``
            columns. The frame is not modified.
        temperature: target temperature; only used to build the result keys.

    Returns:
        A one-element list holding a dict with the keys ``"name"``,
        ``"Cv_gravimetric_<T>_mean"``, ``"Cv_gravimetric_<T>_std"``,
        ``"Cv_molar_<T>_mean"`` and ``"Cv_molar_<T>_std"``, where ``<T>`` is
        ``temperature`` formatted with ``str.format``.

    Raises:
        ValueError: if ``df_features`` contains more than one structure.
        IndexError: if ``df_features`` contains no structure at all.
    """
    structure_names = df_features["structure_name"].unique()
    # The original guard used "> 2", which let a two-structure frame through
    # and silently ignored the second structure.
    if len(structure_names) > 1:
        raise ValueError("More than one structure in the features file...")
    structure_name = structure_names[0]
    print(structure_name)

    # Loop-invariant quantities: which rows belong to the structure, how many
    # there are, and their total atomic weight.
    in_structure = (df_features["structure_name"] == structure_name).to_numpy()
    n_sites = int(in_structure.sum())
    total_weight = np.sum(df_features.loc[in_structure, "site AtomicWeight"])
    site_features = df_features[FEATURES]

    predicted_mol = []
    predicted_gr = []
    for model in models:
        site_cv = np.asarray(model.predict(site_features))[in_structure]
        total_cv = np.sum(site_cv)
        predicted_mol.append(total_cv / n_sites)
        predicted_gr.append(total_cv / total_weight)

    return [{
        "name": structure_name,
        "Cv_gravimetric_{}_mean".format(temperature): np.mean(predicted_gr),
        "Cv_gravimetric_{}_std".format(temperature): np.std(predicted_gr),
        "Cv_molar_{}_mean".format(temperature): np.mean(predicted_mol),
        "Cv_molar_{}_std".format(temperature): np.std(predicted_mol),
    }]
def predict_Cv_ensemble_structure_multitemperatures(path_to_models: str, structure_name: str, features_file: str="features.csv", FEATURES: list=cv_features, temperatures: list=[300.00], save_to: str="cv_predicted.csv") -> pd.DataFrame:
    """Predict heat capacity of one structure at several temperatures.

    For every temperature the ensemble stored under
    ``<path_to_models>/<temperature rounded to 0 decimals>/`` is loaded with
    ``joblib`` and applied to the rows of ``features_file`` that belong to
    ``structure_name``. The per-temperature results are merged on the
    ``"name"`` column into a single table.

    Args:
        path_to_models: directory containing one sub-directory of model files
            per temperature.
        structure_name: structure to predict. Rows are matched on the index
            column of the CSV (``"Unnamed: 0"``) with its last
            ``_``-separated token removed.
        features_file: CSV file containing the features.
        FEATURES: features for ML model.
        temperatures: target temperatures; must not be empty. The default
            list is never mutated.
        save_to: path of the CSV file to write the results to; a falsy value
            skips writing.

    Returns:
        A pandas dataframe with the gravimetric and molar heat capacity
        (ensemble mean and standard deviation) for every temperature.

    Raises:
        ValueError: if ``temperatures`` is empty.
    """
    # Previously an empty list fell through to an UnboundLocalError.
    if len(temperatures) == 0:
        raise ValueError("At least one temperature is required.")

    df_features = pd.read_csv(features_file)
    df_features["structure_name"] = ["_".join(n.split("_")[:-1]) for n in df_features["Unnamed: 0"]]
    df_features = df_features.loc[df_features["structure_name"] == structure_name]

    all_results = None
    for temperature in temperatures:
        print("loading models for:", temperature)
        # Sorted so that the load order does not depend on the file system.
        modelnames = sorted(glob.glob("{}/{:.0f}/*".format(path_to_models, temperature)))
        # SECURITY: joblib.load unpickles the files; only point this at
        # trusted model directories.
        models = [joblib.load(n) for n in modelnames]
        print("{} models loaded, predicting...".format(len(models)))
        # NOTE(review): an empty model directory is not rejected here —
        # confirm whether that should be an error.
        res = pd.DataFrame(predict_Cv_ensemble_structure(models, FEATURES, df_features, temperature))
        if all_results is None:
            all_results = res
        else:
            all_results = all_results.merge(res, how="inner", on="name")

    if save_to:
        all_results.to_csv(save_to)
    return all_results