2 ml baseline

import%20marimo%0A%0A__generated_with%20%3D%20%220.23.1%22%0Aapp%20%3D%20marimo.App()%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Imports%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20import%20logging%0A%20%20%20%20import%20joblib%0A%20%20%20%20import%20gc%0A%20%20%20%20import%20math%0A%20%20%20%20import%20warnings%0A%20%20%20%20from%20abc%20import%20ABC%2C%20abstractmethod%0A%20%20%20%20from%20pathlib%20import%20Path%0A%20%20%20%20from%20typing%20import%20Iterator%2C%20Literal%2C%20Optional%0A%20%20%20%20from%20urllib.request%20import%20urlretrieve%0A%0A%20%20%20%20import%20matplotlib.pyplot%20as%20plt%0A%20%20%20%20import%20numpy%20as%20np%0A%20%20%20%20import%20polars%20as%20pl%0A%20%20%20%20import%20marimo%20as%20mo%0A%20%20%20%20import%20altair%20as%20alt%0A%20%20%20%20import%20pingouin%20as%20pg%0A%20%20%20%20import%20seaborn%20as%20sns%0A%0A%20%20%20%20from%20scipy%20import%20stats%0A%20%20%20%20from%20scipy.stats%20import%20spearmanr%0A%20%20%20%20from%20sklearn.metrics%20import%20(%0A%20%20%20%20%20%20%20%20accuracy_score%2C%0A%20%20%20%20%20%20%20%20r2_score%2C%0A%20%20%20%20%20%20%20%20balanced_accuracy_score%2C%0A%20%20%20%20%20%20%20%20f1_score%2C%0A%20%20%20%20%20%20%20%20matthews_corrcoef%2C%0A%20%20%20%20%20%20%20%20mean_absolute_error%2C%0A%20%20%20%20%20%20%20%20mean_squared_error%2C%0A%20%20%20%20%20%20%20%20precision_score%2C%0A%20%20%20%20%20%20%20%20recall_score%2C%0A%20%20%20%20%20%20%20%20roc_auc_score%2C%0A%20%20%20%20)%0A%20%20%20%20from%20sklearn.model_selection._split%20import%20_BaseKFold%20as%20BaseKFold%0A%20%20%20%20from%20statsmodels.stats.anova%20import%20AnovaRM%0A%20%20%20%20from%20statsmodels.stats.libqsturng%20import%20psturng%2C%20qsturng%0A%0A%20%20%20%20from%20sklearn.ensemble%20import%20RandomForestClassifier%2C%20RandomForestRegressor%0A%20%20%20%20import%20xgboost%20as%20xgb%0A%0A%20%20%20%20import%20torch%0A%20%20%20%20from%20torch%20import%20nn%2C%20optim%0A%20%20%20%20from%20torch.functional%20import%20F%0A%20%20%20%20from%20torch.utils.data%20import%20DataLoader%0A%0A%20%20%20%20import%20lightning%20as%20L%0A%20%20%20%20from%20lightning%20import%20pytorch%20as%20pyl%0A%20%20%20%20from%20lightning.pytorch.callbacks.early_stopping%20import%20EarlyStopping%0A%0A%20%20%20%20from%20chemprop%20import%20data%2C%20featurizers%2C%20models%0A%20%20%20%20from%20chemprop%20import%20nn%20as%20chemnn%0A%0A%20%20%20%20from%20rdkit%20import%20Chem%0A%20%20%20%20from%20rdkit.Chem.Scaffolds%20import%20MurckoScaffold%0A%0A%20%20%20%20from%20skfp.preprocessing%20import%20ConformerGenerator%2C%20MolFromSmilesTransformer%0A%20%20%20%20from%20skfp.fingerprints%20import%20(%0A%20%20%20%20%20%20%20%20ECFPFingerprint%2C%0A%20%20%20%20%20%20%20%20MACCSFingerprint%2C%0A%20%20%20%20%20%20%20%20TopologicalTorsionFingerprint%2C%0A%20%20%20%20%20%20%20%20RDKitFingerprint%2C%0A%20%20%20%20%20%20%20%20AtomPairFingerprint%2C%0A%20%20%20%20%20%20%20%20AvalonFingerprint%2C%0A%20%20%20%20%20%20%20%20E3FPFingerprint%2C%0A%20%20%20%20%20%20%20%20MordredFingerprint%2C%0A%20%20%20%20%20%20%20%20MQNsFingerprint%2C%0A%20%20%20%20%20%20%20%20PubChemFingerprint%2C%0A%20%20%20%20)%0A%0A%20%20%20%20import%20gzip%0A%20%20%20%20import%20shutil%0A%20%20%20%20import%20subprocess%0A%20%20%20%20import%20sys%0A%20%20%20%20import%20tempfile%0A%0A%20%20%20%20import%20pandas%20as%20pd%0A%20%20%20%20import%20matplotlib.patches%20as%20mpatches%0A%20%20%20%20from%20tqdm.auto%20import%20tqdm%0A%20%20%20%20from%20typing%20import%20Iterable%0A%0A%20%20%20%20from%20rdkit%20import%20DataStructs%0A%20%20%20%20from%20rdkit.DataStructs%20import%20ExplicitBitVect%0A%0A%20%20%20%20return%20(%0A%20%20%20%20%20%20%20%20AnovaRM%2C%0A%20%20%20%20%20%20%20%20AtomPairFingerprint%2C%0A%20%20%20%20%20%20%20%20AvalonFingerprint%2C%0A%20%20%20%20%20%20%20%20BaseKFold%2C%0A%20%20%20%20%20%20%20%20Chem%2C%0A%20%20%20%20%20%20%20%20ConformerGenerator%2C%0A%20%20%20%20%20%20%20%20DataStructs%2C%0A%20%20%20%20%20%20%20%20E3FPFingerprint%2C%0A%20%20%20%20%20%20%20%20ECFPFingerprint%2C%0A%20%20%20%20%20%20%20%20ExplicitBitVect%2C%0A%20%20%20%20%20%20%20%20Iterable%2C%0A%20%20%20%20%20%20%20%20Iterator%2C%0A%20%20%20%20%20%20%20%20MACCSFingerprint%2C%0A%20%20%20%20%20%20%20%20MQNsFingerprint%2C%0A%20%20%20%20%20%20%20%20MolFromSmilesTransformer%2C%0A%20%20%20%20%20%20%20%20MordredFingerprint%2C%0A%20%20%20%20%20%20%20%20MurckoScaffold%2C%0A%20%20%20%20%20%20%20%20Optional%2C%0A%20%20%20%20%20%20%20%20Path%2C%0A%20%20%20%20%20%20%20%20PubChemFingerprint%2C%0A%20%20%20%20%20%20%20%20RDKitFingerprint%2C%0A%20%20%20%20%20%20%20%20RandomForestClassifier%2C%0A%20%20%20%20%20%20%20%20RandomForestRegressor%2C%0A%20%20%20%20%20%20%20%20TopologicalTorsionFingerprint%2C%0A%20%20%20%20%20%20%20%20accuracy_score%2C%0A%20%20%20%20%20%20%20%20balanced_accuracy_score%2C%0A%20%20%20%20%20%20%20%20f1_score%2C%0A%20%20%20%20%20%20%20%20gc%2C%0A%20%20%20%20%20%20%20%20gzip%2C%0A%20%20%20%20%20%20%20%20math%2C%0A%20%20%20%20%20%20%20%20matthews_corrcoef%2C%0A%20%20%20%20%20%20%20%20mean_absolute_error%2C%0A%20%20%20%20%20%20%20%20mean_squared_error%2C%0A%20%20%20%20%20%20%20%20mo%2C%0A%20%20%20%20%20%20%20%20mpatches%2C%0A%20%20%20%20%20%20%20%20np%2C%0A%20%20%20%20%20%20%20%20pd%2C%0A%20%20%20%20%20%20%20%20pg%2C%0A%20%20%20%20%20%20%20%20pl%2C%0A%20%20%20%20%20%20%20%20plt%2C%0A%20%20%20%20%20%20%20%20precision_score%2C%0A%20%20%20%20%20%20%20%20psturng%2C%0A%20%20%20%20%20%20%20%20qsturng%2C%0A%20%20%20%20%20%20%20%20r2_score%2C%0A%20%20%20%20%20%20%20%20recall_score%2C%0A%20%20%20%20%20%20%20%20roc_auc_score%2C%0A%20%20%20%20%20%20%20%20shutil%2C%0A%20%20%20%20%20%20%20%20sns%2C%0A%20%20%20%20%20%20%20%20spearmanr%2C%0A%20%20%20%20%20%20%20%20stats%2C%0A%20%20%20%20%20%20%20%20subprocess%2C%0A%20%20%20%20%20%20%20%20sys%2C%0A%20%20%20%20%20%20%20%20tempfile%2C%0A%20%20%20%20%20%20%20%20torch%2C%0A%20%20%20%20%20%20%20%20tqdm%2C%0A%20%20%20%20%20%20%20%20warnings%2C%0A%20%20%20%20%20%20%20%20xgb%2C%0A%20%20%20%20)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Model%20classes%0A%0A%20%20%20%20Four%20model%20types%20share%20a%20stable%20API%3A%0A%0A%20%20%20%20%60%60%60%0A%20%20%20%20model.train(train_df%2C%20target_col%2C%20task%2C%20**kwargs)%0A%20%20%20%20model.predict(df)%20%20%20%20%20%20%20%20%20%20-%3E%20np.ndarray%0A%20%20%20%20%60%60%60%0A%0A%20%20%20%20%7C%20Class%20%7C%20Backend%20%7C%20Input%20features%20%7C%0A%20%20%20%20%7C---%7C---%7C---%7C%0A%20%20%20%20%7C%20%60RandomForestModel%60%20%7C%20sklearn%20RF%20%7C%20fingerprint%20column%20%7C%0A%20%20%20%20%7C%20%60BoostedTreesModel%60%20%7C%20XGBoost%20%7C%20fingerprint%20column%20%7C%0A%20%20%20%20%7C%20%60ChempropModel%60%20%7C%20Chemprop%20v2%20MPNN%20from%20scratch%20%7C%20SMILES%20column%20%7C%0A%20%20%20%20%7C%20%60ChempropChemeleonModel%60%20%7C%20Chemprop%20v2%20fine-tuned%20from%20%5BCheMeleon%5D(https%3A%2F%2Fgithub.com%2FJacksonBurns%2Fchemeleon)%20backbone%20%7C%20SMILES%20column%20%7C%0A%20%20%20%20%7C%20%60MeanBaseline%60%20%7C%20Predict%20all%20values%20as%20mean%20pEC50%20on%20train%20set%20%7C%20None%20%7C%0A%20%20%20%20%7C%20%60NearestNeighbourBaseline%60%20%7C%20Predict%20value%20as%20pEC50%20of%20NN%20in%20train%20set%20%7C%20fingerprint%20column%20%7C%0A%0A%20%20%20%20%60task%60%20is%20either%20%60%22regression%22%60%20or%20%60%22classification%22%60.%0A%20%20%20%20Classification%20%60predict()%60%20returns%20the%20probability%20of%20the%20positive%20class.%0A%0A%20%20%20%20The%20last%20two%20models%20are%20not%20expected%20to%20provide%20good%20performance%2C%20but%20can%20be%20a%20useful%0A%20%20%20%20way%20to%20obtain%20a%20worst-case%20MAE%20value.%0A%20%20%20%20The%20%60MeanBaseline%60%20model%20should%20provide%20an%20R%C2%B2%20of%20zero%20and%20a%20rho%20value%20of%20NaN.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(np%2C%20pl)%3A%0A%20%20%20%20def%20extract_fp_matrix(df%3A%20pl.DataFrame%2C%20fp_col%3A%20str)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Extract%20a%202-D%20float32%20feature%20matrix%20from%20a%20fingerprint%20column.%0A%0A%20%20%20%20%20%20%20%20The%20column%20is%20expected%20to%20hold%20numpy%20arrays%20of%20equal%20length%20as%20produced%0A%20%20%20%20%20%20%20%20by%20generate_fingerprint.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20DataFrame%20with%20a%20fingerprint%20column.%0A%20%20%20%20%20%20%20%20%20%20%20%20fp_col%3A%20Column%20name.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%202-D%20array%20of%20shape%20(n_compounds%2C%20fp_size).%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20return%20np.stack(df%5Bfp_col%5D.to_list()).astype(np.float32)%0A%0A%0A%0A%0A%0A%20%20%20%20return%20(extract_fp_matrix%2C)%0A%0A%0A%40app.cell%0Adef%20_(RandomForestClassifier%2C%20RandomForestRegressor%2C%20np)%3A%0A%20%20%20%20class%20RandomForestModel%3A%0A%20%20%20%20%20%20%20%20%22%22%22Scikit-learn%20Random%20Forest%20model%20with%20a%20unified%20fit%2Fpredict%20interface.%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(self%2C%20pred_type%3A%20str%20%3D%20%22classification%22)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20%22classification%22%20(RandomForestClassifier)%20or%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22regression%22%20(RandomForestRegressor).%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Raises%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20ValueError%3A%20If%20pred_type%20is%20not%20%22classification%22%20or%20%22regression%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self.model%20%3D%20None%0A%20%20%20%20%20%20%20%20%20%20%20%20self.pred_type%20%3D%20pred_type%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20pred_type%20%3D%3D%20%22classification%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20self.model%20%3D%20RandomForestClassifier(n_jobs%3D-1)%0A%20%20%20%20%20%20%20%20%20%20%20%20elif%20pred_type%20%3D%3D%20%22regression%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20self.model%20%3D%20RandomForestRegressor(n_jobs%3D-1)%0A%20%20%20%20%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22pred_type%20must%20be%20either%20'classification'%20or%20'regression'%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20def%20train(self%2C%20X_train%3A%20np.ndarray%2C%20y_train%3A%20np.ndarray)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Fit%20the%20model%20on%20the%20training%20data.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20Training%20feature%20matrix.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20Training%20labels%20or%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self.model.fit(X_train%2C%20y_train)%0A%0A%20%20%20%20%20%20%20%20def%20predict(self%2C%20X_test%3A%20np.ndarray)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Generate%20predictions%20for%20the%20test%20set.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_test%3A%20Test%20feature%20matrix.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20Predicted%20probabilities%20(classification)%20or%20values%20(regression).%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20self.pred_type%20%3D%3D%20%22classification%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20return%20self.model.predict_proba(X_test)%5B%3A%2C%201%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20return%20self.model.predict(X_test)%0A%0A%20%20%20%20return%20(RandomForestModel%2C)%0A%0A%0A%40app.cell%0Adef%20_(np%2C%20xgb)%3A%0A%20%20%20%20class%20BoostedTreesModel%3A%0A%20%20%20%20%20%20%20%20%22%22%22XGBoost%20gradient-boosted%20tree%20model%20with%20a%20unified%20fit%2Fpredict%20interface.%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(self%2C%20pred_type%3A%20str%20%3D%20%22classification%22)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20%22classification%22%20(XGBClassifier)%20or%20%22regression%22%20(XGBRegressor).%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Raises%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20ValueError%3A%20If%20pred_type%20is%20not%20%22classification%22%20or%20%22regression%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self.model%20%3D%20None%0A%20%20%20%20%20%20%20%20%20%20%20%20self.pred_type%20%3D%20pred_type%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20pred_type%20%3D%3D%20%22classification%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20self.model%20%3D%20xgb.XGBClassifier(tree_method%3D%22hist%22%2C%20%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20early_stopping_rounds%3D2%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20n_jobs%3D-1)%0A%20%20%20%20%20%20%20%20%20%20%20%20elif%20pred_type%20%3D%3D%20%22regression%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20self.model%20%3D%20xgb.XGBRegressor(tree_method%3D%22hist%22%2C%20%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20early_stopping_rounds%3D2%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20n_jobs%3D-1)%0A%20%20%20%20%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22pred_type%20must%20be%20either%20'classification'%20or%20'regression'%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20def%20train(self%2C%20X_train%3A%20np.ndarray%2C%20y_train%3A%20np.ndarray%2C%20X_val%3A%20np.ndarray%2C%20y_val%3A%20np.ndarray)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Fit%20the%20model%20using%20training%20data%20with%20an%20evaluation%20set%20for%20early%20stopping.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20Training%20feature%20matrix.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20Training%20labels%20or%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_val%3A%20Validation%20feature%20matrix%20used%20for%20early%20stopping.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_val%3A%20Validation%20labels%20or%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self.model.fit(X_train%2C%20y_train%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20eval_set%3D%5B(X_val%2C%20y_val)%5D%2C%20verbose%3DFalse)%0A%0A%20%20%20%20%20%20%20%20def%20predict(self%2C%20X_test%3A%20np.ndarray)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Generate%20predictions%20for%20the%20test%20set.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_test%3A%20Test%20feature%20matrix.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20Predicted%20probabilities%20(classification)%20or%20values%20(regression).%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20self.pred_type%20%3D%3D%20%22classification%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20return%20self.model.predict_proba(X_test)%5B%3A%2C%201%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20return%20self.model.predict(X_test)%0A%0A%0A%0A%20%20%20%20return%20(BoostedTreesModel%2C)%0A%0A%0A%40app.cell%0Adef%20_(Optional%2C%20np)%3A%0A%20%20%20%20class%20MeanBaseline%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Non-ML%20baseline%20that%20predicts%20the%20training-set%20mean%20for%20every%20test%20compound.%0A%0A%20%20%20%20%20%20%20%20This%20is%20the%20simplest%20possible%20baseline%3A%20it%20ignores%20all%20molecular%20structure%0A%20%20%20%20%20%20%20%20and%20returns%20a%20constant%20prediction%20equal%20to%20the%20mean%20of%20the%20training%20labels.%0A%20%20%20%20%20%20%20%20Any%20useful%20model%20must%20beat%20this.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(self)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20self._mean%3A%20Optional%5Bfloat%5D%20%3D%20None%0A%0A%20%20%20%20%20%20%20%20def%20train(self%2C%20X_train%3A%20np.ndarray%2C%20y_train%3A%20np.ndarray)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Store%20the%20training-set%20mean.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20Feature%20matrix%20(unused%20%E2%80%94%20kept%20for%20API%20compatibility).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20Training%20labels%20or%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self._mean%20%3D%20float(np.mean(y_train))%0A%0A%20%20%20%20%20%20%20%20def%20predict(self%2C%20X_test%3A%20np.ndarray)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Return%20the%20training-set%20mean%20for%20every%20test%20compound.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_test%3A%20Test%20feature%20matrix%20(only%20shape%20is%20used).%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%201-D%20array%20of%20length%20n_test%20filled%20with%20the%20training%20mean.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20np.full(X_test.shape%5B0%5D%2C%20self._mean%2C%20dtype%3Dnp.float32)%0A%0A%20%20%20%20class%20NearestNeighbourBaseline%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Simple%20ML%20baseline%20that%20predicts%20the%20training%20label%20of%20the%20most%20similar%0A%20%20%20%20%20%20%20%20training%20compound%20(1-NN%20regression%20by%20Tanimoto%20similarity).%0A%0A%20%20%20%20%20%20%20%20Fingerprints%20are%20treated%20as%20binary%20vectors.%20Tanimoto%20similarity%20is%20computed%0A%20%20%20%20%20%20%20%20efficiently%20with%20matrix%20operations%3A%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20T(a%2C%20b)%20%3D%20%7Ca%20%E2%88%A9%20b%7C%20%2F%20%7Ca%20%E2%88%AA%20b%7C%20%20%3D%20%20dot(a%2C%20b)%20%2F%20(%7Ca%7C%20%2B%20%7Cb%7C%20-%20dot(a%2C%20b))%0A%0A%20%20%20%20%20%20%20%20where%20%7Ca%7C%20%3D%20sum%20of%20set%20bits%20%3D%20dot(a%2C%20a).%0A%20%20%20%20%20%20%20%20%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(self)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20self._X_train%3A%20Optional%5Bnp.ndarray%5D%20%3D%20None%0A%20%20%20%20%20%20%20%20%20%20%20%20self._y_train%3A%20Optional%5Bnp.ndarray%5D%20%3D%20None%0A%0A%20%20%20%20%20%20%20%20def%20train(self%2C%20X_train%3A%20np.ndarray%2C%20y_train%3A%20np.ndarray)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Store%20the%20training%20fingerprints%20and%20labels.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20Binary%20fingerprint%20matrix%20of%20shape%20(n_train%2C%20fp_size).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20Training%20target%20values%20of%20shape%20(n_train%2C).%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self._X_train%20%3D%20X_train.astype(np.float32)%0A%20%20%20%20%20%20%20%20%20%20%20%20self._y_train%20%3D%20y_train.astype(np.float32)%0A%0A%20%20%20%20%20%20%20%20def%20predict(self%2C%20X_test%3A%20np.ndarray%2C%20chunk_size%3A%20int%20%3D%2064)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Return%20the%20training%20label%20of%20the%20nearest%20neighbour%20for%20each%20test%20compound.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Tanimoto%20similarity%20is%20computed%20in%20row-chunks%20of%20test%20compounds%20so%20that%0A%20%20%20%20%20%20%20%20%20%20%20%20only%20a%20(chunk_size%20%C3%97%20n_train)%20matrix%20exists%20in%20memory%20at%20any%20time%2C%20rather%0A%20%20%20%20%20%20%20%20%20%20%20%20than%20the%20full%20(n_test%20%C3%97%20n_train)%20matrix.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_test%3A%20Binary%20fingerprint%20matrix%20of%20shape%20(n_test%2C%20fp_size).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20chunk_size%3A%20Number%20of%20test%20compounds%20processed%20per%20chunk.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%201-D%20array%20of%20shape%20(n_test%2C)%20with%20the%20nearest-neighbour%20predictions.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20X_test%20%3D%20X_test.astype(np.float32)%0A%20%20%20%20%20%20%20%20%20%20%20%20train_counts%20%3D%20self._X_train.sum(axis%3D1)%20%20%23%20(n_train%2C)%20%E2%80%94%20computed%20once%0A%20%20%20%20%20%20%20%20%20%20%20%20nn_idx%20%3D%20np.empty(X_test.shape%5B0%5D%2C%20dtype%3Dnp.intp)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20start%20in%20range(0%2C%20X_test.shape%5B0%5D%2C%20chunk_size)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20chunk%20%3D%20X_test%5Bstart%20%3A%20start%20%2B%20chunk_size%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20dot%20%3D%20chunk%20%40%20self._X_train.T%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20(chunk%2C%20n_train)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20test_counts%20%3D%20chunk.sum(axis%3D1)%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20(chunk%2C)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20union%20%3D%20test_counts%5B%3A%2C%20None%5D%20%2B%20train_counts%5BNone%2C%20%3A%5D%20-%20dot%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20tanimoto%20%3D%20np.where(union%20%3E%200%2C%20dot%20%2F%20union%2C%200.0)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20nn_idx%5Bstart%20%3A%20start%20%2B%20chunk_size%5D%20%3D%20np.argmax(tanimoto%2C%20axis%3D1)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20self._y_train%5Bnn_idx%5D%0A%0A%20%20%20%20return%20MeanBaseline%2C%20NearestNeighbourBaseline%0A%0A%0A%40app.cell%0Adef%20_(Optional%2C%20Path%2C%20np%2C%20pl%2C%20shutil%2C%20subprocess%2C%20sys%2C%20tempfile%2C%20torch)%3A%0A%20%20%20%20%23%20Resolve%20the%20chemprop%20CLI%20from%20the%20same%20venv%20as%20the%20running%20interpreter%0A%20%20%20%20_CHEMPROP_BIN%20%3D%20Path(sys.executable).parent%20%2F%20%22chemprop%22%0A%0A%20%20%20%20%23%20Persistent%20temp%20directories%20%E2%80%94%20one%20per%20model%20type%2C%20reused%20across%20CV%20folds.%0A%20%20%20%20%23%20Using%20a%20fixed%20path%20(not%20TemporaryDirectory)%20so%20the%20folder%20survives%20between%0A%20%20%20%20%23%20train()%20and%20predict()%20calls%20within%20the%20same%20session.%0A%20%20%20%20_CHEMPROP_MODEL_DIR%20%20%3D%20Path(tempfile.gettempdir())%20%2F%20%22chemprop_scratch_model%22%0A%20%20%20%20_CHEMELEON_MODEL_DIR%20%3D%20Path(tempfile.gettempdir())%20%2F%20%22chemprop_chemeleon_model%22%0A%0A%20%20%20%20def%20_get_device()%20-%3E%20str%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Detect%20and%20return%20the%20best%20available%20compute%20device%20for%20PyTorch.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22cuda%22%20if%20an%20NVIDIA%2FAMD%20GPU%20is%20available%2C%20%22mps%22%20if%20running%20on%20Apple%0A%20%20%20%20%20%20%20%20%20%20%20%20Silicon%20with%20Metal%20Performance%20Shaders%2C%20otherwise%20%22cpu%22.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20return%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22cuda%22%20%23%20Device%20for%20NVIDIA%20or%20AMD%20GPUs%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20torch.cuda.is_available()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20else%20%22mps%22%20%23%20Device%20for%20Apple%20Silicon%20(Metal%20Performance%20Shaders)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20torch.backends.mps.is_available()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20else%20%22cpu%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20def%20_write_smiles_csv(%0A%20%20%20%20%20%20%20%20smiles%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20targets%3A%20Optional%5Bnp.ndarray%5D%2C%0A%20%20%20%20%20%20%20%20path%3A%20Path%2C%0A%20%20%20%20%20%20%20%20target_col%3A%20str%2C%0A%20%20%20%20)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Write%20a%20CSV%20file%20with%20a%20smiles%20column%20and%20an%20optional%20target%20column.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20smiles%3A%20List%20of%20SMILES%20strings.%0A%20%20%20%20%20%20%20%20%20%20%20%20targets%3A%201-D%20array%20of%20target%20values%2C%20or%20None%20for%20inference-only%20files.%0A%20%20%20%20%20%20%20%20%20%20%20%20path%3A%20Destination%20file%20path.%0A%20%20%20%20%20%20%20%20%20%20%20%20target_col%3A%20Name%20of%20the%20target%20column.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20if%20targets%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%20%3D%20pl.DataFrame(%7B%22smiles%22%3A%20smiles%2C%20target_col%3A%20targets.flatten().tolist()%7D)%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%20%3D%20pl.DataFrame(%7B%22smiles%22%3A%20smiles%7D)%0A%20%20%20%20%20%20%20%20df.write_csv(path)%0A%0A%20%20%20%20%23%20Single%20log%20file%20for%20all%20chemprop%20CLI%20calls%20%E2%80%94%20appended%20across%20folds.%0A%20%20%20%20_CHEMPROP_LOG%20%3D%20Path(tempfile.gettempdir())%20%2F%20%22chemprop_cli.log%22%0A%0A%20%20%20%20def%20_run_chemprop_cli(args%3A%20list%5Bstr%5D)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Run%20the%20chemprop%20CLI%20as%20a%20subprocess%2C%20redirecting%20all%20output%20to%20a%20log%20file.%0A%0A%20%20%20%20%20%20%20%20stdout%20and%20stderr%20are%20appended%20to%20_CHEMPROP_LOG%20so%20the%20notebook%20stays%0A%20%20%20%20%20%20%20%20quiet.%20On%20failure%20the%20tail%20of%20the%20log%20is%20printed%20to%20help%20diagnose%20the%20error.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20args%3A%20Argument%20list%20passed%20after%20the%20%60chemprop%60%20binary.%0A%0A%20%20%20%20%20%20%20%20Raises%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20RuntimeError%3A%20If%20the%20process%20exits%20with%20a%20non-zero%20return%20code.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20cmd%20%3D%20%5Bstr(_CHEMPROP_BIN)%5D%20%2B%20args%0A%20%20%20%20%20%20%20%20with%20open(_CHEMPROP_LOG%2C%20%22a%22)%20as%20_log%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20_log.write(f%22%5Cn%7B'%3D'*60%7D%5CnCMD%3A%20%7B'%20'.join(cmd)%7D%5Cn%7B'%3D'*60%7D%5Cn%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20result%20%3D%20subprocess.run(cmd%2C%20stdout%3D_log%2C%20stderr%3D_log%2C%20text%3DTrue)%0A%20%20%20%20%20%20%20%20if%20result.returncode%20!%3D%200%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Print%20only%20the%20last%2030%20lines%20of%20the%20log%20to%20surface%20the%20error%0A%20%20%20%20%20%20%20%20%20%20%20%20lines%20%3D%20_CHEMPROP_LOG.read_text().splitlines()%0A%20%20%20%20%20%20%20%20%20%20%20%20print(%22%5Cn%22.join(lines%5B-30%3A%5D))%0A%20%20%20%20%20%20%20%20%20%20%20%20raise%20RuntimeError(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22chemprop%20CLI%20failed%20(exit%20%7Bresult.returncode%7D).%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22Full%20log%3A%20%7B_CHEMPROP_LOG%7D%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20class%20ChempropModel%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Chemprop%20D-MPNN%20trained%20from%20scratch%20via%20the%20chemprop%20CLI.%0A%0A%20%20%20%20%20%20%20%20train()%20and%20predict()%20shell%20out%20to%20%60chemprop%20train%60%20%2F%20%60chemprop%20predict%60%0A%20%20%20%20%20%20%20%20rather%20than%20using%20the%20Python%20API%2C%20avoiding%20MPS%20memory%20issues%20when%20running%0A%20%20%20%20%20%20%20%20many%20CV%20folds%20inside%20a%20notebook%20kernel.%0A%0A%20%20%20%20%20%20%20%20The%20trained%20model%20is%20written%20to%20a%20fixed%20temporary%20directory%0A%20%20%20%20%20%20%20%20(%2Ftmp%2Fchemprop_scratch_model)%20which%20is%20overwritten%20on%20each%20train()%20call%0A%20%20%20%20%20%20%20%20so%20no%20disk%20space%20accumulates%20across%20folds.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(%0A%20%20%20%20%20%20%20%20%20%20%20%20self%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20str%20%3D%20%22regression%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20model_dir%3A%20Path%20%3D%20_CHEMPROP_MODEL_DIR%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20epochs%3A%20int%20%3D%2050%2C%0A%20%20%20%20%20%20%20%20)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20%22regression%22%20or%20%22classification%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20model_dir%3A%20Directory%20where%20the%20CLI%20writes%20model%20checkpoints.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20Reused%20(overwritten)%20on%20every%20train()%20call.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20epochs%3A%20Maximum%20number%20of%20training%20epochs.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20pred_type%20not%20in%20(%22regression%22%2C%20%22classification%22)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(%22pred_type%20must%20be%20'regression'%20or%20'classification'%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20self.pred_type%20%3D%20pred_type%0A%20%20%20%20%20%20%20%20%20%20%20%20self.model_dir%20%3D%20model_dir%0A%20%20%20%20%20%20%20%20%20%20%20%20self.epochs%20%20%20%20%3D%20epochs%0A%20%20%20%20%20%20%20%20%20%20%20%20self.target_col%3A%20Optional%5Bstr%5D%20%3D%20None%20%20%23%20set%20during%20train()%0A%0A%20%20%20%20%20%20%20%20def%20train(%0A%20%20%20%20%20%20%20%20%20%20%20%20self%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20np.ndarray%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20X_val%3A%20%20%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20y_val%3A%20%20%20np.ndarray%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20target_col%3A%20str%20%3D%20%22target%22%2C%0A%20%20%20%20%20%20%20%20)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Train%20the%20model%20by%20calling%20%60chemprop%20train%60%20via%20subprocess.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Writes%20temporary%20CSV%20files%20for%20train%20and%20val%20sets%2C%20runs%20the%20CLI%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20then%20removes%20the%20CSVs.%20The%20model%20directory%20is%20cleared%20before%20each%0A%20%20%20%20%20%20%20%20%20%20%20%20run%20so%20old%20checkpoints%20do%20not%20accumulate.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20SMILES%20strings%20for%20training.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20Training%20targets%2C%20shape%20(n%2C)%20or%20(n%2C%201).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_val%3A%20%20%20SMILES%20strings%20for%20validation%20(early%20stopping).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_val%3A%20%20%20Validation%20targets%2C%20shape%20(n%2C)%20or%20(n%2C%201).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20target_col%3A%20Column%20name%20used%20in%20the%20temporary%20CSV%20files.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self.target_col%20%3D%20target_col%0A%20%20%20%20%20%20%20%20%20%20%20%20tmp%20%3D%20Path(tempfile.gettempdir())%0A%20%20%20%20%20%20%20%20%20%20%20%20train_csv%20%3D%20tmp%20%2F%20%22chemprop_train.csv%22%0A%20%20%20%20%20%20%20%20%20%20%20%20val_csv%20%20%20%3D%20tmp%20%2F%20%22chemprop_val.csv%22%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_write_smiles_csv(X_train%2C%20y_train%2C%20train_csv%2C%20target_col)%0A%20%20%20%20%20%20%20%20%20%20%20%20_write_smiles_csv(X_val%2C%20%20%20y_val%2C%20%20%20val_csv%2C%20%20%20target_col)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Remove%20stale%20checkpoints%20so%20the%20CLI%20starts%20fresh%20each%20fold%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20self.model_dir.exists()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20shutil.rmtree(self.model_dir)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20task_type%20%3D%20%22regression%22%20if%20self.pred_type%20%3D%3D%20%22regression%22%20else%20%22binary%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Pass%20val_csv%20twice%20(as%20val%20and%20as%20dummy%20test)%20so%20the%20CLI%20tracks%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20val_loss%20for%20early%20stopping.%20Two-file%20mode%20triggers%20a%20validation%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20error%20unless%20--split-sizes%20is%20also%20set.%0A%20%20%20%20%20%20%20%20%20%20%20%20_run_chemprop_cli(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22train%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--data-path%22%2C%20str(train_csv)%2C%20str(val_csv)%2C%20str(val_csv)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--smiles-columns%22%2C%20%22smiles%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--target-columns%22%2C%20target_col%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--task-type%22%2C%20task_type%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--accelerator%22%2C%20_get_device()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--epochs%22%2C%20str(self.epochs)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--save-dir%22%2C%20str(self.model_dir)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%5D)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20train_csv.unlink(missing_ok%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20val_csv.unlink(missing_ok%3DTrue)%0A%0A%20%20%20%20%20%20%20%20def%20predict(self%2C%20X_test%3A%20list%5Bstr%5D)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Run%20inference%20by%20calling%20%60chemprop%20predict%60%20via%20subprocess.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Writes%20a%20temporary%20SMILES%20CSV%2C%20runs%20the%20CLI%2C%20reads%20the%20output%20CSV%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20then%20removes%20both%20temporary%20files.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_test%3A%20SMILES%20strings%20to%20predict.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%201-D%20numpy%20array%20of%20predicted%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20tmp%20%3D%20Path(tempfile.gettempdir())%0A%20%20%20%20%20%20%20%20%20%20%20%20test_csv%20%3D%20tmp%20%20%2F%20%22chemprop_test.csv%22%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_csv%20%3D%20tmp%20%20%2F%20%22chemprop_preds.csv%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20The%20best.pt%20written%20by%20%60chemprop%20train%60%20into%20model_dir%2Fmodel_0%2F%0A%20%20%20%20%20%20%20%20%20%20%20%20model_pt%20%3D%20self.model_dir%20%2F%20%22model_0%22%20%2F%20%22best.pt%22%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_write_smiles_csv(X_test%2C%20None%2C%20test_csv%2C%20self.target_col)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_run_chemprop_cli(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22predict%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--test-path%22%2C%20%20str(test_csv)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--model-path%22%2C%20str(model_pt)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--preds-path%22%2C%20str(pred_csv)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%5D)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20preds%20%3D%20pl.read_csv(pred_csv)%5Bself.target_col%5D.to_numpy()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20test_csv.unlink(missing_ok%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_csv.unlink(missing_ok%3DTrue)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20preds.flatten()%0A%0A%20%20%20%20class%20ChempropChemeleonModel%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Chemprop%20D-MPNN%20fine-tuned%20from%20the%20CheMeleon%20pretrained%20backbone%20via%20the%20CLI.%0A%0A%20%20%20%20%20%20%20%20Identical%20interface%20to%20ChempropModel%20but%20passes%20%60--from-foundation%20CHEMELEON%60%0A%20%20%20%20%20%20%20%20to%20%60chemprop%20train%60.%20%20The%20CLI%20downloads%20and%20caches%20the%20CheMeleon%20weights%0A%20%20%20%20%20%20%20%20automatically%20at%20~%2F.chemprop%2Fchemeleon_mp.pt%20on%20the%20first%20call.%0A%0A%20%20%20%20%20%20%20%20Reference%3A%20https%3A%2F%2Fgithub.com%2FJacksonBurns%2Fchemeleon%0A%20%20%20%20%20%20%20%20%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(%0A%20%20%20%20%20%20%20%20%20%20%20%20self%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20str%20%3D%20%22regression%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20model_dir%3A%20Path%20%3D%20_CHEMELEON_MODEL_DIR%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20epochs%3A%20int%20%3D%2050%2C%0A%20%20%20%20%20%20%20%20)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20%22regression%22%20or%20%22classification%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20model_dir%3A%20Directory%20where%20the%20CLI%20writes%20model%20checkpoints.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20Distinct%20from%20ChempropModel's%20default%20to%20avoid%20collisions.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20epochs%3A%20Maximum%20number%20of%20training%20epochs.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20pred_type%20not%20in%20(%22regression%22%2C%20%22classification%22)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(%22pred_type%20must%20be%20'regression'%20or%20'classification'%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20self.pred_type%20%20%3D%20pred_type%0A%20%20%20%20%20%20%20%20%20%20%20%20self.model_dir%20%20%3D%20model_dir%0A%20%20%20%20%20%20%20%20%20%20%20%20self.epochs%20%20%20%20%20%3D%20epochs%0A%20%20%20%20%20%20%20%20%20%20%20%20self.target_col%3A%20Optional%5Bstr%5D%20%3D%20None%0A%0A%20%20%20%20%20%20%20%20def%20train(%0A%20%20%20%20%20%20%20%20%20%20%20%20self%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20np.ndarray%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20X_val%3A%20%20%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20y_val%3A%20%20%20np.ndarray%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20target_col%3A%20str%20%3D%20%22target%22%2C%0A%20%20%20%20%20%20%20%20)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Fine-tune%20from%20CheMeleon%20by%20calling%20%60chemprop%20train%20--from-foundation%20CHEMELEON%60.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_train%3A%20SMILES%20strings%20for%20training.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_train%3A%20Training%20targets%2C%20shape%20(n%2C)%20or%20(n%2C%201).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_val%3A%20%20%20SMILES%20strings%20for%20validation%20(early%20stopping).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20y_val%3A%20%20%20Validation%20targets%2C%20shape%20(n%2C)%20or%20(n%2C%201).%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20target_col%3A%20Column%20name%20used%20in%20the%20temporary%20CSV%20files.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20self.target_col%20%3D%20target_col%0A%20%20%20%20%20%20%20%20%20%20%20%20tmp%20%3D%20Path(tempfile.gettempdir())%0A%20%20%20%20%20%20%20%20%20%20%20%20train_csv%20%3D%20tmp%20%2F%20%22chemeleon_train.csv%22%0A%20%20%20%20%20%20%20%20%20%20%20%20val_csv%20%20%20%3D%20tmp%20%2F%20%22chemeleon_val.csv%22%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_write_smiles_csv(X_train%2C%20y_train%2C%20train_csv%2C%20target_col)%0A%20%20%20%20%20%20%20%20%20%20%20%20_write_smiles_csv(X_val%2C%20%20%20y_val%2C%20%20%20val_csv%2C%20%20%20target_col)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20self.model_dir.exists()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20shutil.rmtree(self.model_dir)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20task_type%20%3D%20%22regression%22%20if%20self.pred_type%20%3D%3D%20%22regression%22%20else%20%22binary%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Pass%20val_csv%20twice%20(as%20val%20and%20as%20dummy%20test)%20%E2%80%94%20same%20reason%20as%20ChempropModel.%0A%20%20%20%20%20%20%20%20%20%20%20%20_run_chemprop_cli(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22train%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--data-path%22%2C%20str(train_csv)%2C%20str(val_csv)%2C%20str(val_csv)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--smiles-columns%22%2C%20%22smiles%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--target-columns%22%2C%20target_col%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--task-type%22%2C%20task_type%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--accelerator%22%2C%20_get_device()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--epochs%22%2C%20str(self.epochs)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--from-foundation%22%2C%20%22CHEMELEON%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--save-dir%22%2C%20str(self.model_dir)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%5D)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20train_csv.unlink(missing_ok%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20val_csv.unlink(missing_ok%3DTrue)%0A%0A%20%20%20%20%20%20%20%20def%20predict(self%2C%20X_test%3A%20list%5Bstr%5D)%20-%3E%20np.ndarray%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20Run%20inference%20by%20calling%20%60chemprop%20predict%60%20via%20subprocess.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20X_test%3A%20SMILES%20strings%20to%20predict.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%201-D%20numpy%20array%20of%20predicted%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20tmp%20%3D%20Path(tempfile.gettempdir())%0A%20%20%20%20%20%20%20%20%20%20%20%20test_csv%20%3D%20tmp%20%20%2F%20%22chemeleon_test.csv%22%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_csv%20%3D%20tmp%20%20%2F%20%22chemeleon_preds.csv%22%0A%20%20%20%20%20%20%20%20%20%20%20%20model_pt%20%3D%20self.model_dir%20%2F%20%22model_0%22%20%2F%20%22best.pt%22%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_write_smiles_csv(X_test%2C%20None%2C%20test_csv%2C%20self.target_col)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_run_chemprop_cli(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22predict%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--test-path%22%2C%20%20str(test_csv)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--model-path%22%2C%20str(model_pt)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22--preds-path%22%2C%20str(pred_csv)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%5D)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20preds%20%3D%20pl.read_csv(pred_csv)%5Bself.target_col%5D.to_numpy()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20test_csv.unlink(missing_ok%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_csv.unlink(missing_ok%3DTrue)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20preds.flatten()%0A%0A%20%20%20%20return%20ChempropChemeleonModel%2C%20ChempropModel%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20accuracy_score%2C%0A%20%20%20%20balanced_accuracy_score%2C%0A%20%20%20%20f1_score%2C%0A%20%20%20%20matthews_corrcoef%2C%0A%20%20%20%20mean_absolute_error%2C%0A%20%20%20%20mean_squared_error%2C%0A%20%20%20%20np%2C%0A%20%20%20%20precision_score%2C%0A%20%20%20%20r2_score%2C%0A%20%20%20%20recall_score%2C%0A%20%20%20%20roc_auc_score%2C%0A%20%20%20%20spearmanr%2C%0A%20%20%20%20warnings%2C%0A)%3A%0A%20%20%20%20_classification_metrics%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22accuracy%22%3A%20accuracy_score%2C%0A%20%20%20%20%20%20%20%20%22balanced_accuracy%22%3A%20balanced_accuracy_score%2C%0A%20%20%20%20%20%20%20%20%22precision%22%3A%20precision_score%2C%0A%20%20%20%20%20%20%20%20%22recall%22%3A%20recall_score%2C%0A%20%20%20%20%20%20%20%20%22f1%22%3A%20f1_score%2C%0A%20%20%20%20%20%20%20%20%22mcc%22%3A%20matthews_corrcoef%2C%0A%20%20%20%20%7D%0A%0A%20%20%20%20def%20_safe_spearmanr(y_true%2C%20y_pred)%3A%0A%20%20%20%20%20%20%20%20with%20warnings.catch_warnings()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20warnings.simplefilter(%22ignore%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20spearmanr(y_true%2C%20y_pred).correlation%0A%0A%20%20%20%20_regression_metrics%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22r2%22%3A%20r2_score%2C%0A%20%20%20%20%20%20%20%20%22rho%22%3A%20lambda%20y_true%2C%20y_pred%3A%20_safe_spearmanr(y_true%2C%20y_pred)%2C%0A%20%20%20%20%20%20%20%20%22mse%22%3A%20mean_squared_error%2C%0A%20%20%20%20%20%20%20%20%22rmse%22%3A%20lambda%20y_true%2C%20y_pred%3A%20np.sqrt(mean_squared_error(y_true%2C%20y_pred))%2C%0A%20%20%20%20%20%20%20%20%22mae%22%3A%20mean_absolute_error%2C%0A%20%20%20%20%7D%0A%0A%20%20%20%20def%20evaluate_predictions(%0A%20%20%20%20%20%20%20%20y_pred%3A%20np.ndarray%2C%0A%20%20%20%20%20%20%20%20y_test%3A%20np.ndarray%2C%0A%20%20%20%20%20%20%20%20pred_type%3A%20str%2C%0A%20%20%20%20%20%20%20%20thr%3A%20float%20%3D%200.5%2C%0A%20%20%20%20)%20-%3E%20dict%5Bstr%2C%20float%5D%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Compute%20a%20standard%20set%20of%20metrics%20for%20either%20classification%20or%20regression.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20y_pred%3A%20Model%20predictions.%20For%20classification%2C%20these%20should%20be%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20probability%20scores%20(0%E2%80%931)%3B%20for%20regression%2C%20continuous%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20y_test%3A%20Ground-truth%20labels%20or%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_type%3A%20%22classification%22%20or%20%22regression%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20thr%3A%20Decision%20threshold%20applied%20to%20y_pred%20for%20binary%20classification%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20metrics.%20Ignored%20for%20regression.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Dictionary%20mapping%20metric%20names%20to%20their%20computed%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20Classification%20metrics%3A%20accuracy%2C%20balanced_accuracy%2C%20precision%2C%20recall%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20f1%2C%20mcc%2C%20roc_auc.%20Regression%20metrics%3A%20r2%2C%20rho%2C%20mse%2C%20rmse%2C%20mae.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20if%20pred_type%20%3D%3D%20%22classification%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20out%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20metric%3A%20_classification_metrics%5Bmetric%5D(y_test%2C%20y_pred%20%3E%20thr)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20for%20metric%20in%20_classification_metrics%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20%20%20%20%20out%5B%22roc_auc%22%5D%20%3D%20roc_auc_score(y_test%2C%20y_pred)%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20out%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20metric%3A%20_regression_metrics%5Bmetric%5D(y_test%2C%20y_pred)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20for%20metric%20in%20_regression_metrics%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%0A%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20AtomPairFingerprint%2C%0A%20%20%20%20AvalonFingerprint%2C%0A%20%20%20%20ConformerGenerator%2C%0A%20%20%20%20E3FPFingerprint%2C%0A%20%20%20%20ECFPFingerprint%2C%0A%20%20%20%20MACCSFingerprint%2C%0A%20%20%20%20MQNsFingerprint%2C%0A%20%20%20%20MolFromSmilesTransformer%2C%0A%20%20%20%20MordredFingerprint%2C%0A%20%20%20%20PubChemFingerprint%2C%0A%20%20%20%20RDKitFingerprint%2C%0A%20%20%20%20TopologicalTorsionFingerprint%2C%0A%20%20%20%20pl%2C%0A)%3A%0A%20%20%20%20_fp_dict%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22ecfp%22%3A%20ECFPFingerprint%2C%0A%20%20%20%20%20%20%20%20%22morgan%22%3A%20ECFPFingerprint%2C%0A%20%20%20%20%20%20%20%20%22maccs%22%3A%20MACCSFingerprint%2C%0A%20%20%20%20%20%20%20%20%22torsion%22%3A%20TopologicalTorsionFingerprint%2C%0A%20%20%20%20%20%20%20%20%22rdkit%22%3A%20RDKitFingerprint%2C%0A%20%20%20%20%20%20%20%20%22atompair%22%3A%20AtomPairFingerprint%2C%0A%20%20%20%20%20%20%20%20%22avalon%22%3A%20AvalonFingerprint%2C%0A%20%20%20%20%20%20%20%20%22e3fp%22%3A%20E3FPFingerprint%2C%0A%20%20%20%20%20%20%20%20%22mordred%22%3A%20MordredFingerprint%2C%0A%20%20%20%20%20%20%20%20%22mqn%22%3A%20MQNsFingerprint%2C%0A%20%20%20%20%20%20%20%20%22pubchem%22%3A%20PubChemFingerprint%2C%0A%20%20%20%20%7D%0A%0A%20%20%20%20def%20generate_fingerprint(df%3A%20pl.DataFrame%2C%20fingerprint_type%3A%20str%2C%20**kwargs)%20-%3E%20pl.DataFrame%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Generate%20molecular%20fingerprints%20using%20scikit-fingerprints%20and%20add%20them%20as%20a%20column.%0A%0A%20%20%20%20%20%20%20%20Dispatches%20to%20the%20appropriate%20skfp%20fingerprint%20class%20based%20on%20fingerprint_type.%0A%20%20%20%20%20%20%20%20For%20fingerprint%20types%20that%20require%203D%20conformers%20(e.g.%2C%20E3FP)%2C%20conformers%20are%0A%20%20%20%20%20%20%20%20generated%20automatically%20via%20RDKit%20ETKDGv3.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20containing%20a%20%22smiles%22%20column.%0A%20%20%20%20%20%20%20%20%20%20%20%20fingerprint_type%3A%20One%20of%20the%20supported%20types%3A%20%22ecfp%22%2F%22morgan%22%2C%20%22maccs%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22torsion%22%2C%20%22rdkit%22%2C%20%22atompair%22%2C%20%22avalon%22%2C%20%22e3fp%22%2C%20%22mordred%22%2C%20%22mqn%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22pubchem%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20**kwargs%3A%20Additional%20keyword%20arguments%20forwarded%20to%20the%20skfp%20fingerprint%20class%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20constructor%20(e.g.%2C%20radius%3D3%2C%20n_bits%3D1024%20for%20ECFP).%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20DataFrame%20with%20an%20added%20column%20named%20after%20fingerprint_type%20containing%0A%20%20%20%20%20%20%20%20%20%20%20%20the%20computed%20fingerprint%20arrays.%0A%0A%20%20%20%20%20%20%20%20Raises%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20ValueError%3A%20If%20fingerprint_type%20is%20not%20a%20recognized%20key.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20if%20fingerprint_type%20not%20in%20_fp_dict.keys()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22Fingerprint%20type%20not%20recognized%3A%20%7Bfingerprint_type!r%7D.%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22Valid%20values%3A%20%7Blist(_fp_dict.keys())%7D%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20if%20len(kwargs)%20%3D%3D%200%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20fp_func%20%3D%20_fp_dict%5Bfingerprint_type%5D()%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20fp_func%20%3D%20_fp_dict%5Bfingerprint_type%5D(**kwargs)%0A%0A%20%20%20%20%20%20%20%20if%20fp_func.requires_conformers%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20mol_from_smiles%20%3D%20MolFromSmilesTransformer()%0A%20%20%20%20%20%20%20%20%20%20%20%20conf_gen%20%3D%20ConformerGenerator()%0A%20%20%20%20%20%20%20%20%20%20%20%20mols_list%20%3D%20mol_from_smiles.transform(df.get_column(%22smiles%22))%0A%20%20%20%20%20%20%20%20%20%20%20%20mols_list%20%3D%20conf_gen.transform(mols_list)%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20mols_list%20%3D%20df.get_column(%22smiles%22)%0A%0A%20%20%20%20%20%20%20%20fps%20%3D%20fp_func.transform(mols_list)%0A%20%20%20%20%20%20%20%20fps_col%20%3D%20pl.Series(values%3Dfps%2C%20name%3Dfingerprint_type)%0A%20%20%20%20%20%20%20%20fps%20%3D%20df.with_columns(fps_col)%0A%20%20%20%20%20%20%20%20return%20fps%0A%0A%20%20%20%20return%20(generate_fingerprint%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Data%20splitting%20utilities%0A%0A%20%20%20%20Three%20complementary%20CV%20strategies%20are%20implemented%20here%20to%20assess%20how%20well%20models%0A%20%20%20%20generalise%20across%20different%20notions%20of%20%22unseen%22%20data%3A%0A%0A%20%20%20%20%7C%20Strategy%20%7C%20What%20it%20tests%20%7C%0A%20%20%20%20%7C---%7C---%7C%0A%20%20%20%20%7C%20**Random**%20%7C%20Baseline%20%E2%80%94%20molecules%20are%20shuffled%20at%20random%20across%20folds%20%7C%0A%20%20%20%20%7C%20**Scaffold**%20%7C%20Chemical%20generalisation%20%E2%80%94%20folds%20are%20split%20by%20Bemis%E2%80%93Murcko%20scaffold%20so%20the%20model%20never%20sees%20a%20scaffold%20at%20test%20time%20that%20it%20trained%20on%20%7C%0A%20%20%20%20%7C%20**Temporal**%20%7C%20Prospective%20generalisation%20%E2%80%94%20molecules%20are%20ordered%20by%20their%20numeric%20ID%20(a%20proxy%20for%20acquisition%20time)%20%20%7C%0A%0A%20%20%20%20Random%20and%20scaffold%20CV%20share%20the%20nested%20generator%20interface%3A%0A%20%20%20%20%60(fold_index%2C%20outer_index%2C%20inner_index%2C%20train_df%2C%20val_df%2C%20test_df)%60%0A%0A%20%20%20%20Temporal%20CV%20uses%20a%20simpler%20walk-forward%20interface%3A%0A%20%20%20%20%60(fold_index%2C%20train_df%2C%20val_df%2C%20test_df)%60%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(BaseKFold%2C%20Iterator%2C%20Optional%2C%20np%2C%20pl)%3A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20helpers%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%0A%20%20%20%20def%20split_dataset_random(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20p_test%3A%20float%20%3D%200.2%2C%0A%20%20%20%20%20%20%20%20seed%3A%20int%20%3D%2042%2C%0A%20%20%20%20)%20-%3E%20tuple%5Bpl.DataFrame%2C%20pl.DataFrame%5D%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Randomly%20split%20a%20DataFrame%20into%20train%20and%20test%20subsets.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Input%20DataFrame.%0A%20%20%20%20%20%20%20%20%20%20%20%20p_test%3A%20Fraction%20of%20rows%20allocated%20to%20the%20test%20set.%0A%20%20%20%20%20%20%20%20%20%20%20%20seed%3A%20Random%20seed%20for%20reproducibility.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Tuple%20of%20(train_df%2C%20test_df).%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20rng%20%3D%20np.random.default_rng(seed)%0A%20%20%20%20%20%20%20%20idx%20%3D%20rng.permutation(df.shape%5B0%5D)%0A%20%20%20%20%20%20%20%20n_test%20%3D%20int(len(idx)%20*%20p_test)%0A%20%20%20%20%20%20%20%20test_idx%2C%20train_idx%20%3D%20idx%5B%3An_test%5D%2C%20idx%5Bn_test%3A%5D%0A%20%20%20%20%20%20%20%20return%20df%5Btrain_idx%5D.clone()%2C%20df%5Btest_idx%5D.clone()%0A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20GroupKFoldShuffle%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%0A%20%20%20%20class%20GroupKFoldShuffle(BaseKFold)%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20K-fold%20cross-validator%20that%20respects%20group%20boundaries%20and%20supports%20shuffling.%0A%0A%20%20%20%20%20%20%20%20An%20extension%20of%20scikit-learn's%20GroupKFold%20that%20adds%20optional%20shuffling%20of%0A%20%20%20%20%20%20%20%20groups%20before%20splitting.%20Useful%20for%20scaffold-aware%20cross-validation%20where%0A%20%20%20%20%20%20%20%20you%20want%20reproducible%20but%20shuffled%20group%20assignments.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20n_splits%3A%20Number%20of%20folds.%0A%20%20%20%20%20%20%20%20%20%20%20%20shuffle%3A%20Whether%20to%20shuffle%20groups%20before%20splitting.%0A%20%20%20%20%20%20%20%20%20%20%20%20random_state%3A%20Random%20seed%20used%20when%20shuffle%3DTrue.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%0A%20%20%20%20%20%20%20%20def%20__init__(%0A%20%20%20%20%20%20%20%20%20%20%20%20self%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20n_splits%3A%20int%20%3D%205%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20*%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20shuffle%3A%20bool%20%3D%20False%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20random_state%3A%20Optional%5Bint%5D%20%3D%20None%2C%0A%20%20%20%20%20%20%20%20)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20super().__init__(n_splits%3Dn_splits%2C%20shuffle%3Dshuffle%2C%20random_state%3Drandom_state)%0A%0A%20%20%20%20%20%20%20%20def%20split(self%2C%20X%2C%20y%3DNone%2C%20groups%3DNone)%20-%3E%20Iterator%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Collect%20unique%20groups%2C%20then%20optionally%20shuffle%20them%20so%20that%20fold%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20assignment%20is%20randomised%20while%20still%20keeping%20each%20group%20intact.%0A%20%20%20%20%20%20%20%20%20%20%20%20unique_groups%20%3D%20np.unique(groups)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20self.shuffle%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20rng%20%3D%20np.random.default_rng(self.random_state)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20unique_groups%20%3D%20rng.permutation(unique_groups)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Distribute%20groups%20as%20evenly%20as%20possible%20across%20folds.%0A%20%20%20%20%20%20%20%20%20%20%20%20split_groups%20%3D%20np.array_split(unique_groups%2C%20self.n_splits)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20test_group_ids%20in%20split_groups%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20test_mask%20%3D%20np.isin(groups%2C%20test_group_ids)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20train_mask%20%3D%20~test_mask%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20yield%20np.where(train_mask)%5B0%5D%2C%20np.where(test_mask)%5B0%5D%0A%0A%20%20%20%20return%20GroupKFoldShuffle%2C%20split_dataset_random%0A%0A%0A%40app.cell%0Adef%20_(GroupKFoldShuffle%2C%20Iterator%2C%20pl%2C%20split_dataset_random)%3A%0A%20%20%20%20def%20generate_cv_splits_random(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20n_outer%3A%20int%20%3D%205%2C%0A%20%20%20%20%20%20%20%20n_inner%3A%20int%20%3D%205%2C%0A%20%20%20%20%20%20%20%20seed%3A%20int%20%3D%2042%2C%0A%20%20%20%20%20%20%20%20p_val%3A%20float%20%3D%200%2C%0A%20%20%20%20)%20-%3E%20Iterator%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Generate%20nested%205%C3%975%20CV%20splits%20using%20a%20**random**%20molecule%20assignment.%0A%0A%20%20%20%20%20%20%20%20Each%20molecule%20is%20treated%20as%20its%20own%20group%2C%20so%20folds%20are%20purely%20random.%0A%20%20%20%20%20%20%20%20This%20is%20the%20baseline%20split%20strategy%3A%20it%20gives%20optimistic%20estimates%20of%0A%20%20%20%20%20%20%20%20generalisation%20because%20train%20and%20test%20scaffolds%20can%20overlap.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20to%20split.%0A%20%20%20%20%20%20%20%20%20%20%20%20n_outer%3A%20Number%20of%20outer%20CV%20folds.%0A%20%20%20%20%20%20%20%20%20%20%20%20n_inner%3A%20Number%20of%20inner%20CV%20folds%20per%20outer%20iteration.%0A%20%20%20%20%20%20%20%20%20%20%20%20seed%3A%20Random%20seed%20for%20GroupKFoldShuffle.%0A%20%20%20%20%20%20%20%20%20%20%20%20p_val%3A%20Fraction%20of%20the%20training%20set%20reserved%20as%20a%20validation%20split.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%200%20disables%20the%20validation%20split%20(val_df%20is%20yielded%20as%20None).%0A%0A%20%20%20%20%20%20%20%20Yields%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Tuples%20of%20(fold_index%2C%20outer_index%2C%20inner_index%2C%20train_df%2C%20val_df%2C%20test_df).%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20for%20i%20in%20range(n_outer)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20kf%20%3D%20GroupKFoldShuffle(n_splits%3Dn_inner%2C%20random_state%3Dseed%20%2B%20i%2C%20shuffle%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20groups%20%3D%20list(range(df.shape%5B0%5D))%20%20%23%20each%20molecule%20is%20its%20own%20group%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20j%2C%20(train_idx%2C%20test_idx)%20in%20enumerate(kf.split(df%2C%20groups%3Dgroups))%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20fold%20%3D%20i%20*%20n_inner%20%2B%20j%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20train%20%3D%20df%5Btrain_idx%5D.clone()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20test%20%3D%20df%5Btest_idx%5D.clone()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20val%20%3D%20None%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20p_val%20%3E%200%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20train%2C%20val%20%3D%20split_dataset_random(train%2C%20p_test%3Dp_val%2C%20seed%3Dseed%20%2B%20fold)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20yield%20fold%2C%20i%2C%20j%2C%20train%2C%20val%2C%20test%0A%0A%20%20%20%20return%20(generate_cv_splits_random%2C)%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20Chem%2C%0A%20%20%20%20GroupKFoldShuffle%2C%0A%20%20%20%20Iterator%2C%0A%20%20%20%20MurckoScaffold%2C%0A%20%20%20%20np%2C%0A%20%20%20%20pl%2C%0A%20%20%20%20split_dataset_random%2C%0A)%3A%0A%20%20%20%20def%20_get_bemis_murcko_scaffold(smiles%3A%20str)%20-%3E%20str%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Return%20the%20canonical%20Bemis%E2%80%93Murcko%20scaffold%20SMILES%20for%20a%20molecule.%0A%0A%20%20%20%20%20%20%20%20Molecules%20that%20fail%20to%20parse%20or%20have%20no%20ring%20system%20return%20an%20empty%20string%2C%0A%20%20%20%20%20%20%20%20which%20causes%20them%20to%20be%20pooled%20together%20into%20a%20single%20%22no%20scaffold%22%20group.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20smiles%3A%20Input%20SMILES%20string.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Canonical%20scaffold%20SMILES%2C%20or%20%22%22%20on%20failure.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20mol%20%3D%20Chem.MolFromSmiles(smiles)%0A%20%20%20%20%20%20%20%20if%20mol%20is%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20%22%22%0A%20%20%20%20%20%20%20%20scaffold%20%3D%20MurckoScaffold.GetScaffoldForMol(mol)%0A%20%20%20%20%20%20%20%20return%20Chem.MolToSmiles(scaffold%2C%20canonical%3DTrue)%0A%0A%20%20%20%20def%20generate_cv_splits_scaffold(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20n_outer%3A%20int%20%3D%205%2C%0A%20%20%20%20%20%20%20%20n_inner%3A%20int%20%3D%205%2C%0A%20%20%20%20%20%20%20%20seed%3A%20int%20%3D%2042%2C%0A%20%20%20%20%20%20%20%20p_val%3A%20float%20%3D%200%2C%0A%20%20%20%20)%20-%3E%20Iterator%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Generate%20nested%205%C3%975%20CV%20splits%20using%20**Bemis%E2%80%93Murcko%20scaffold**%20assignment.%0A%0A%20%20%20%20%20%20%20%20All%20molecules%20sharing%20the%20same%20scaffold%20are%20kept%20in%20the%20same%20fold%2C%20so%20the%0A%20%20%20%20%20%20%20%20model%20never%20encounters%20a%20scaffold%20at%20test%20time%20that%20appeared%20during%20training.%0A%20%20%20%20%20%20%20%20This%20gives%20a%20more%20realistic%20estimate%20of%20performance%20on%20genuinely%20novel%0A%20%20%20%20%20%20%20%20chemical%20series.%0A%0A%20%20%20%20%20%20%20%20Scaffold%20groups%20are%20shuffled%20(but%20not%20ordered%20by%20size)%20before%20fold%20assignment%0A%20%20%20%20%20%20%20%20so%20that%20very%20large%20scaffolds%20are%20randomly%20distributed.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20containing%20a%20%22smiles%22%20column.%0A%20%20%20%20%20%20%20%20%20%20%20%20n_outer%3A%20Number%20of%20outer%20CV%20folds.%0A%20%20%20%20%20%20%20%20%20%20%20%20n_inner%3A%20Number%20of%20inner%20CV%20folds%20per%20outer%20iteration.%0A%20%20%20%20%20%20%20%20%20%20%20%20seed%3A%20Random%20seed%20for%20GroupKFoldShuffle.%0A%20%20%20%20%20%20%20%20%20%20%20%20p_val%3A%20Fraction%20of%20the%20training%20set%20reserved%20as%20a%20validation%20split.%0A%0A%20%20%20%20%20%20%20%20Yields%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Tuples%20of%20(fold_index%2C%20outer_index%2C%20inner_index%2C%20train_df%2C%20val_df%2C%20test_df).%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20Compute%20scaffold%20label%20for%20every%20molecule%20once%20(reused%20across%20all%20folds).%0A%20%20%20%20%20%20%20%20scaffolds%20%3D%20%5B_get_bemis_murcko_scaffold(s)%20for%20s%20in%20df%5B%22smiles%22%5D.to_list()%5D%0A%20%20%20%20%20%20%20%20%23%20Map%20each%20unique%20scaffold%20SMILES%20to%20an%20integer%20group%20id.%0A%20%20%20%20%20%20%20%20unique_scaffolds%20%3D%20list(dict.fromkeys(scaffolds))%20%20%23%20preserves%20first-seen%20order%0A%20%20%20%20%20%20%20%20scaffold_to_id%20%3D%20%7Bs%3A%20i%20for%20i%2C%20s%20in%20enumerate(unique_scaffolds)%7D%0A%20%20%20%20%20%20%20%20groups%20%3D%20np.array(%5Bscaffold_to_id%5Bs%5D%20for%20s%20in%20scaffolds%5D)%0A%0A%20%20%20%20%20%20%20%20for%20i%20in%20range(n_outer)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20kf%20%3D%20GroupKFoldShuffle(n_splits%3Dn_inner%2C%20random_state%3Dseed%20%2B%20i%2C%20shuffle%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20j%2C%20(train_idx%2C%20test_idx)%20in%20enumerate(kf.split(df%2C%20groups%3Dgroups))%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20fold%20%3D%20i%20*%20n_inner%20%2B%20j%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20train%20%3D%20df%5Btrain_idx%5D.clone()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20test%20%3D%20df%5Btest_idx%5D.clone()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20val%20%3D%20None%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20p_val%20%3E%200%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20train%2C%20val%20%3D%20split_dataset_random(train%2C%20p_test%3Dp_val%2C%20seed%3Dseed%20%2B%20fold)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20yield%20fold%2C%20i%2C%20j%2C%20train%2C%20val%2C%20test%0A%0A%20%20%20%20return%20(generate_cv_splits_scaffold%2C)%0A%0A%0A%40app.cell%0Adef%20_(Iterator%2C%20np%2C%20pl%2C%20split_dataset_random)%3A%0A%20%20%20%20def%20_extract_molecule_number(name%3A%20str)%20-%3E%20int%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Extract%20the%20numeric%20portion%20from%20a%20molecule%20name%20used%20as%20a%20temporal%20proxy.%0A%0A%20%20%20%20%20%20%20%20The%20molecule_names%20column%20contains%20identifiers%20such%20as%20%22OADMET-0003144%22%2C%0A%20%20%20%20%20%20%20%20where%20the%20numeric%20field%20after%20the%20dash%20encodes%20acquisition%20order.%0A%20%20%20%20%20%20%20%20Molecules%20with%20an%20empty%20or%20unparsable%20name%20return%200%20and%20are%20sorted%0A%20%20%20%20%20%20%20%20to%20the%20earliest%20positions.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20name%3A%20Molecule%20identifier%20string%20(e.g.%20%22OADMET-0003144%22).%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Integer%20index%20representing%20acquisition%20order.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20try%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20int(name.split(%22-%22)%5B1%5D)%0A%20%20%20%20%20%20%20%20except%20(IndexError%2C%20ValueError)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%200%0A%0A%20%20%20%20def%20generate_cv_splits_temporal(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20n_folds%3A%20int%20%3D%205%2C%0A%20%20%20%20%20%20%20%20seed%3A%20int%20%3D%2042%2C%0A%20%20%20%20%20%20%20%20p_val%3A%20float%20%3D%200%2C%0A%20%20%20%20)%20-%3E%20Iterator%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Generate%20CV%20splits%20by%20chunking%20molecules%20in%20temporal%20order.%0A%0A%20%20%20%20%20%20%20%20Molecules%20are%20sorted%20by%20the%20numeric%20component%20of%20their%20molecule_names%20value%0A%20%20%20%20%20%20%20%20(e.g.%20%220003144%22%20in%20%22OADMET-0003144%22)%2C%20which%20is%20assumed%20to%20reflect%20acquisition%0A%20%20%20%20%20%20%20%20order.%20The%20sorted%20dataset%20is%20divided%20into%20n_folds%20equal-ish%20chunks.%20Each%20fold%0A%20%20%20%20%20%20%20%20uses%20one%20chunk%20as%20the%20test%20set%20and%20the%20concatenation%20of%20all%20remaining%20chunks%0A%20%20%20%20%20%20%20%20as%20the%20training%20set.%0A%0A%20%20%20%20%20%20%20%20Note%3A%20this%20differs%20from%20a%20strict%20walk-forward%20split%20%E2%80%94%20training%20data%20for%20a%0A%20%20%20%20%20%20%20%20given%20fold%20may%20include%20molecules%20acquired%20*after*%20the%20test%20chunk.%20The%20intent%0A%20%20%20%20%20%20%20%20is%20purely%20to%20use%20the%20numeric%20ID%20as%20a%20proxy%20for%20diversity%2Fbatches%20rather%20than%0A%20%20%20%20%20%20%20%20to%20enforce%20a%20hard%20temporal%20boundary.%0A%0A%20%20%20%20%20%20%20%20Molecules%20with%20empty%20or%20unparsable%20names%20are%20assigned%20order%200%20and%20sort%0A%20%20%20%20%20%20%20%20to%20the%20beginning%20of%20the%20sequence%20(treated%20as%20the%20earliest%20acquired).%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20containing%20a%20%22molecule_names%22%20column.%0A%20%20%20%20%20%20%20%20%20%20%20%20n_folds%3A%20Number%20of%20folds%20%2F%20chunks.%0A%20%20%20%20%20%20%20%20%20%20%20%20seed%3A%20Random%20seed%20used%20only%20for%20the%20optional%20validation%20split.%0A%20%20%20%20%20%20%20%20%20%20%20%20p_val%3A%20Fraction%20of%20the%20training%20set%20reserved%20as%20a%20validation%20split%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20(sampled%20randomly).%0A%0A%20%20%20%20%20%20%20%20Yields%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Tuples%20of%20(fold_index%2C%20train_df%2C%20val_df%2C%20test_df).%0A%20%20%20%20%20%20%20%20%20%20%20%20val_df%20is%20None%20when%20p_val%20%3D%3D%200.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20Sort%20molecules%20by%20numeric%20ID%20so%20earlier%20IDs%20appear%20first.%0A%20%20%20%20%20%20%20%20mol_numbers%20%3D%20np.array(%0A%20%20%20%20%20%20%20%20%20%20%20%20%5B_extract_molecule_number(n)%20for%20n%20in%20df%5B%22molecule_names%22%5D.to_list()%5D%0A%20%20%20%20%20%20%20%20)%0A%20%20%20%20%20%20%20%20sorted_idx%20%3D%20np.argsort(mol_numbers%2C%20kind%3D%22stable%22)%0A%20%20%20%20%20%20%20%20df_sorted%20%3D%20df%5Bsorted_idx%5D.clone()%0A%20%20%20%20%20%20%20%20n%20%3D%20df_sorted.shape%5B0%5D%0A%0A%20%20%20%20%20%20%20%20%23%20Cut%20the%20sorted%20data%20into%20n_folds%20equal-ish%20chunks.%0A%20%20%20%20%20%20%20%20boundaries%20%3D%20np.linspace(0%2C%20n%2C%20n_folds%20%2B%201%2C%20dtype%3Dint)%0A%20%20%20%20%20%20%20%20chunks%20%3D%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20df_sorted%5Bboundaries%5Bi%5D%3Aboundaries%5Bi%20%2B%201%5D%5D.clone()%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20i%20in%20range(n_folds)%0A%20%20%20%20%20%20%20%20%5D%0A%0A%20%20%20%20%20%20%20%20for%20fold%20in%20range(n_folds)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20test%20%3D%20chunks%5Bfold%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20train%20%3D%20pl.concat(%5Bchunks%5Bi%5D%20for%20i%20in%20range(n_folds)%20if%20i%20!%3D%20fold%5D)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20val%20%3D%20None%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20p_val%20%3E%200%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20train%2C%20val%20%3D%20split_dataset_random(train%2C%20p_test%3Dp_val%2C%20seed%3Dseed%20%2B%20fold)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20yield%20fold%2C%20train%2C%20val%2C%20test%0A%0A%20%20%20%20return%20(generate_cv_splits_temporal%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20ML%20comparison%20code%0A%0A%20%20%20%20Adapted%20from%20https%3A%2F%2Fgithub.com%2Fpolaris-hub%2Fpolaris-method-comparison%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20mean_absolute_error%2C%0A%20%20%20%20mean_squared_error%2C%0A%20%20%20%20pl%2C%0A%20%20%20%20precision_score%2C%0A%20%20%20%20r2_score%2C%0A%20%20%20%20recall_score%2C%0A%20%20%20%20spearmanr%2C%0A%20%20%20%20warnings%2C%0A)%3A%0A%20%20%20%20def%20calc_regression_metrics(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20cycle_col%3A%20str%2C%0A%20%20%20%20%20%20%20%20val_col%3A%20str%2C%0A%20%20%20%20%20%20%20%20pred_col%3A%20str%2C%0A%20%20%20%20%20%20%20%20thresh%3A%20float%2C%0A%20%20%20%20)%20-%3E%20pl.DataFrame%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Calculate%20regression%20metrics%20(MAE%2C%20MSE%2C%20R2%2C%20rho%2C%20prec%2C%20recall)%20for%20each%20method%20and%20split.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20with%20columns%20%5Bmethod%2C%20split%5D%20plus%20the%20columns%20named%20in%20the%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20remaining%20arguments.%0A%20%20%20%20%20%20%20%20%20%20%20%20cycle_col%3A%20Column%20indicating%20the%20cross-validation%20fold.%0A%20%20%20%20%20%20%20%20%20%20%20%20val_col%3A%20Column%20with%20the%20ground%20truth%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_col%3A%20Column%20with%20the%20model%20predictions.%0A%20%20%20%20%20%20%20%20%20%20%20%20thresh%3A%20Decision%20threshold%20used%20to%20binarise%20continuous%20values%20for%20precision%2Frecall.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Polars%20DataFrame%20with%20columns%20%5Bcv_cycle%2C%20method%2C%20split%2C%20mae%2C%20mse%2C%20r2%2C%20rho%2C%20prec%2C%20recall%5D.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20Derive%20binary%20class%20columns%20from%20the%20continuous%20threshold%0A%20%20%20%20%20%20%20%20df_in%20%3D%20df.with_columns(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20(pl.col(val_col)%20%3E%20thresh).alias(%22true_class%22)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20(pl.col(pred_col)%20%3E%20thresh).alias(%22pred_class%22)%2C%0A%20%20%20%20%20%20%20%20%5D)%0A%0A%20%20%20%20%20%20%20%20%23%20Ensure%20the%20threshold%20actually%20produces%20two%20distinct%20classes%0A%20%20%20%20%20%20%20%20assert%20df_in%5B%22true_class%22%5D.n_unique()%20%3D%3D%202%2C%20%22Binary%20classification%20requires%20two%20classes%22%0A%0A%20%20%20%20%20%20%20%20metric_list%3A%20list%5Bdict%5D%20%3D%20%5B%5D%0A%0A%20%20%20%20%20%20%20%20%23%20Iterate%20over%20each%20(cycle%2C%20method%2C%20split)%20group%20and%20compute%20metrics%0A%20%20%20%20%20%20%20%20for%20group_keys%2C%20group_df%20in%20df_in.group_by(%5Bcycle_col%2C%20%22method%22%2C%20%22split%22%5D)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20cycle%2C%20method%2C%20split%20%3D%20group_keys%0A%20%20%20%20%20%20%20%20%20%20%20%20y_true%20%3D%20group_df%5Bval_col%5D.to_numpy()%0A%20%20%20%20%20%20%20%20%20%20%20%20y_pred%20%3D%20group_df%5Bpred_col%5D.to_numpy()%0A%20%20%20%20%20%20%20%20%20%20%20%20y_true_cls%20%3D%20group_df%5B%22true_class%22%5D.to_numpy()%0A%20%20%20%20%20%20%20%20%20%20%20%20y_pred_cls%20%3D%20group_df%5B%22pred_class%22%5D.to_numpy()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20with%20warnings.catch_warnings()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20warnings.simplefilter(%22ignore%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20rho%2C%20_%20%3D%20spearmanr(y_true%2C%20y_pred)%0A%20%20%20%20%20%20%20%20%20%20%20%20metric_list.append(%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22cv_cycle%22%3A%20cycle%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22method%22%3A%20method%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22split%22%3A%20split%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22mae%22%3A%20mean_absolute_error(y_true%2C%20y_pred)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22mse%22%3A%20mean_squared_error(y_true%2C%20y_pred)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22r2%22%3A%20r2_score(y_true%2C%20y_pred)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22rho%22%3A%20float(rho)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22prec%22%3A%20precision_score(y_true_cls%2C%20y_pred_cls)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22recall%22%3A%20recall_score(y_true_cls%2C%20y_pred_cls)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D)%0A%0A%20%20%20%20%20%20%20%20return%20pl.DataFrame(metric_list)%0A%0A%20%20%20%20return%20(calc_regression_metrics%2C)%0A%0A%0A%40app.cell%0Adef%20_(Optional%2C%20np%2C%20pd%2C%20pg%2C%20pl%2C%20psturng%2C%20qsturng%2C%20warnings)%3A%0A%20%20%20%20def%20rm_tukey_hsd(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20metric%3A%20str%2C%0A%20%20%20%20%20%20%20%20group_col%3A%20str%2C%0A%20%20%20%20%20%20%20%20alpha%3A%20float%20%3D%200.05%2C%0A%20%20%20%20%20%20%20%20sort%3A%20bool%20%3D%20False%2C%0A%20%20%20%20%20%20%20%20direction_dict%3A%20Optional%5Bdict%5D%20%3D%20None%2C%0A%20%20%20%20)%20-%3E%20tuple%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Perform%20repeated%20measures%20Tukey%20HSD%20test%20on%20the%20given%20Polars%20DataFrame.%0A%0A%20%20%20%20%20%20%20%20Internally%20converts%20to%20pandas%20for%20pingouin%2Fstatsmodels%20compatibility.%0A%20%20%20%20%20%20%20%20All%20returned%20DataFrames%20are%20pandas%20objects%20for%20downstream%20seaborn%20plotting.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20with%20columns%20%5Bcv_cycle%2C%20group_col%2C%20metric%5D.%0A%20%20%20%20%20%20%20%20%20%20%20%20metric%3A%20Column%20name%20of%20the%20metric%20to%20test.%0A%20%20%20%20%20%20%20%20%20%20%20%20group_col%3A%20Column%20name%20indicating%20the%20comparison%20groups.%0A%20%20%20%20%20%20%20%20%20%20%20%20alpha%3A%20Significance%20level%20for%20the%20test.%0A%20%20%20%20%20%20%20%20%20%20%20%20sort%3A%20Whether%20to%20sort%20groups%20by%20their%20mean%20metric%20value.%0A%20%20%20%20%20%20%20%20%20%20%20%20direction_dict%3A%20Maps%20metric%20names%20to%20%22maximize%22%20or%20%22minimize%22%20for%20sort%20direction.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Tuple%20of%20(result_tab%2C%20df_means%2C%20df_means_diff%2C%20pc)%20%E2%80%94%20all%20pandas%20DataFrames.%0A%20%20%20%20%20%20%20%20%20%20%20%20-%20result_tab%3A%20Pairwise%20comparisons%20with%20adjusted%20p-values.%0A%20%20%20%20%20%20%20%20%20%20%20%20-%20df_means%3A%20Mean%20values%20per%20group.%0A%20%20%20%20%20%20%20%20%20%20%20%20-%20df_means_diff%3A%20Matrix%20of%20pairwise%20mean%20differences.%0A%20%20%20%20%20%20%20%20%20%20%20%20-%20pc%3A%20Matrix%20of%20adjusted%20p-values.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20Convert%20to%20pandas%20%E2%80%94%20pingouin%20and%20statsmodels%20require%20it%0A%20%20%20%20%20%20%20%20df_pd%20%3D%20df.to_pandas()%0A%0A%20%20%20%20%20%20%20%20if%20sort%20and%20direction_dict%20and%20metric%20in%20direction_dict%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20direction_dict%5Bmetric%5D%20%3D%3D%20'maximize'%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20df_means%20%3D%20df_pd.groupby(group_col).mean(numeric_only%3DTrue).sort_values(metric%2C%20ascending%3DFalse)%0A%20%20%20%20%20%20%20%20%20%20%20%20elif%20direction_dict%5Bmetric%5D%20%3D%3D%20'minimize'%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20df_means%20%3D%20df_pd.groupby(group_col).mean(numeric_only%3DTrue).sort_values(metric%2C%20ascending%3DTrue)%0A%20%20%20%20%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(%22Invalid%20direction.%20Expected%20'maximize'%20or%20'minimize'.%22)%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df_means%20%3D%20df_pd.groupby(group_col).mean(numeric_only%3DTrue)%0A%0A%20%20%20%20%20%20%20%20with%20warnings.catch_warnings()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20warnings.filterwarnings('ignore'%2C%20category%3DRuntimeWarning%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20message%3D'divide%20by%20zero%20encountered%20in%20scalar%20divide')%0A%20%20%20%20%20%20%20%20%20%20%20%20aov%20%3D%20pg.rm_anova(dv%3Dmetric%2C%20within%3Dgroup_col%2C%20subject%3D'cv_cycle'%2C%20data%3Ddf_pd%2C%20detailed%3DTrue)%0A%20%20%20%20%20%20%20%20mse%20%3D%20aov.loc%5B1%2C%20'MS'%5D%0A%20%20%20%20%20%20%20%20df_resid%20%3D%20aov.loc%5B1%2C%20'DF'%5D%0A%0A%20%20%20%20%20%20%20%20methods%20%3D%20df_means.index%0A%20%20%20%20%20%20%20%20n_groups%20%3D%20len(methods)%0A%20%20%20%20%20%20%20%20n_per_group%20%3D%20df_pd%5Bgroup_col%5D.value_counts().mean()%0A%0A%20%20%20%20%20%20%20%20tukey_se%20%3D%20np.sqrt(2%20*%20mse%20%2F%20n_per_group)%0A%20%20%20%20%20%20%20%20q%20%3D%20qsturng(1%20-%20alpha%2C%20n_groups%2C%20df_resid)%0A%0A%20%20%20%20%20%20%20%20num_comparisons%20%3D%20len(methods)%20*%20(len(methods)%20-%201)%20%2F%2F%202%0A%20%20%20%20%20%20%20%20result_tab%20%3D%20pd.DataFrame(index%3Drange(num_comparisons)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20columns%3D%5B%22group1%22%2C%20%22group2%22%2C%20%22meandiff%22%2C%20%22lower%22%2C%20%22upper%22%2C%20%22p-adj%22%5D)%0A%0A%20%20%20%20%20%20%20%20df_means_diff%20%3D%20pd.DataFrame(index%3Dmethods%2C%20columns%3Dmethods%2C%20data%3D0.0)%0A%20%20%20%20%20%20%20%20pc%20%3D%20pd.DataFrame(index%3Dmethods%2C%20columns%3Dmethods%2C%20data%3D1.0)%0A%0A%20%20%20%20%20%20%20%20%23%20Calculate%20pairwise%20mean%20differences%20and%20adjusted%20p-values%0A%20%20%20%20%20%20%20%20row_idx%20%3D%200%0A%20%20%20%20%20%20%20%20for%20i%2C%20method1%20in%20enumerate(methods)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20j%2C%20method2%20in%20enumerate(methods)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20i%20%3C%20j%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20group1%20%3D%20df_pd%5Bdf_pd%5Bgroup_col%5D%20%3D%3D%20method1%5D%5Bmetric%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20group2%20%3D%20df_pd%5Bdf_pd%5Bgroup_col%5D%20%3D%3D%20method2%5D%5Bmetric%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20mean_diff%20%3D%20group1.mean()%20-%20group2.mean()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20studentized_range%20%3D%20np.abs(mean_diff)%20%2F%20tukey_se%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20adjusted_p%20%3D%20psturng(studentized_range%20*%20np.sqrt(2)%2C%20n_groups%2C%20df_resid)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20isinstance(adjusted_p%2C%20np.ndarray)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20adjusted_p%20%3D%20adjusted_p%5B0%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20lower%20%3D%20mean_diff%20-%20(q%20%2F%20np.sqrt(2)%20*%20tukey_se)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20upper%20%3D%20mean_diff%20%2B%20(q%20%2F%20np.sqrt(2)%20*%20tukey_se)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20result_tab.loc%5Brow_idx%5D%20%3D%20%5Bmethod1%2C%20method2%2C%20mean_diff%2C%20lower%2C%20upper%2C%20adjusted_p%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20pc.loc%5Bmethod1%2C%20method2%5D%20%3D%20adjusted_p%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20pc.loc%5Bmethod2%2C%20method1%5D%20%3D%20adjusted_p%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20df_means_diff.loc%5Bmethod1%2C%20method2%5D%20%3D%20mean_diff%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20df_means_diff.loc%5Bmethod2%2C%20method1%5D%20%3D%20-mean_diff%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20row_idx%20%2B%3D%201%0A%0A%20%20%20%20%20%20%20%20df_means_diff%20%3D%20df_means_diff.astype(float)%0A%0A%20%20%20%20%20%20%20%20result_tab%5B%22group1_mean%22%5D%20%3D%20result_tab%5B%22group1%22%5D.map(df_means%5Bmetric%5D)%0A%20%20%20%20%20%20%20%20result_tab%5B%22group2_mean%22%5D%20%3D%20result_tab%5B%22group2%22%5D.map(df_means%5Bmetric%5D)%0A%0A%20%20%20%20%20%20%20%20result_tab.index%20%3D%20result_tab%5B'group1'%5D%20%2B%20'%20-%20'%20%2B%20result_tab%5B'group2'%5D%0A%0A%20%20%20%20%20%20%20%20return%20result_tab%2C%20df_means%2C%20df_means_diff%2C%20pc%0A%0A%20%20%20%20return%20(rm_tukey_hsd%2C)%0A%0A%0A%40app.cell%0Adef%20_(AnovaRM%2C%20Optional%2C%20Path%2C%20pg%2C%20pl%2C%20plt%2C%20sns)%3A%0A%20%20%20%20def%20make_boxplots_parametric(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20metric_ls%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20save_path%3A%20Optional%5BPath%5D%20%3D%20None%2C%0A%20%20%20%20)%20-%3E%20plt.Figure%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20boxplots%20for%20each%20metric%20using%20repeated%20measures%20ANOVA.%0A%0A%20%20%20%20%20%20%20%20Converts%20to%20pandas%20internally%20because%20statsmodels%20AnovaRM%20and%20seaborn%0A%20%20%20%20%20%20%20%20require%20pandas%20DataFrames.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20with%20columns%20%5Bcv_cycle%2C%20method%5D%20plus%20metric%20columns.%0A%20%20%20%20%20%20%20%20%20%20%20%20metric_ls%3A%20List%20of%20metric%20column%20names%20to%20create%20boxplots%20for.%0A%20%20%20%20%20%20%20%20%20%20%20%20save_path%3A%20If%20provided%2C%20the%20figure%20is%20saved%20to%20this%20path%20before%20returning.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Matplotlib%20Figure.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20AnovaRM%20and%20seaborn%20both%20require%20pandas%0A%20%20%20%20%20%20%20%20df_pd%20%3D%20df.to_pandas()%0A%0A%20%20%20%20%20%20%20%20sns.set_context('notebook')%0A%20%20%20%20%20%20%20%20sns.set(rc%3D%7B'figure.figsize'%3A%20(4%2C%203)%7D%2C%20font_scale%3D1.5)%0A%20%20%20%20%20%20%20%20sns.set_style('whitegrid')%0A%20%20%20%20%20%20%20%20figure%2C%20axes%20%3D%20plt.subplots(2%2C%202%2C%20sharex%3DFalse%2C%20sharey%3DFalse%2C%20figsize%3D(14%2C%2010))%0A%0A%20%20%20%20%20%20%20%20for%20i%2C%20stat%20in%20enumerate(metric_ls)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20model%20%3D%20AnovaRM(data%3Ddf_pd%2C%20depvar%3Dstat%2C%20subject%3D'cv_cycle'%2C%20within%3D%5B'method'%5D).fit()%0A%20%20%20%20%20%20%20%20%20%20%20%20p_value%20%3D%20model.anova_table%5B'Pr%20%3E%20F'%5D.iloc%5B0%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20ax%20%3D%20sns.boxplot(y%3Dstat%2C%20x%3D%22method%22%2C%20hue%3D%22method%22%2C%20ax%3Daxes%5Bi%20%2F%2F%202%2C%20i%20%25%202%5D%2C%20data%3Ddf_pd%2C%20palette%3D%22Set2%22%2C%20legend%3DFalse)%0A%20%20%20%20%20%20%20%20%20%20%20%20title%20%3D%20stat.upper()%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_title(f%22p%3D%7Bp_value%3A.1e%7D%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xlabel(%22%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_ylabel(title)%0A%20%20%20%20%20%20%20%20%20%20%20%20x_tick_labels%20%3D%20ax.get_xticklabels()%0A%20%20%20%20%20%20%20%20%20%20%20%20label_text_list%20%3D%20%5Bx.get_text()%20for%20x%20in%20x_tick_labels%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20new_xtick_labels%20%3D%20%5B%22%5Cn%22.join(x.split(%22_%22))%20for%20x%20in%20label_text_list%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xticks(list(range(0%2C%20len(x_tick_labels))))%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xticklabels(new_xtick_labels)%0A%20%20%20%20%20%20%20%20figure.tight_layout()%0A%20%20%20%20%20%20%20%20if%20save_path%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20figure.savefig(save_path%2C%20dpi%3D300%2C%20bbox_inches%3D%22tight%22)%0A%20%20%20%20%20%20%20%20return%20figure%0A%0A%20%20%20%20def%20make_boxplots_nonparametric(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20metric_ls%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20save_path%3A%20Optional%5BPath%5D%20%3D%20None%2C%0A%20%20%20%20)%20-%3E%20plt.Figure%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20boxplots%20for%20each%20metric%20using%20the%20Friedman%20non-parametric%20test.%0A%0A%20%20%20%20%20%20%20%20Converts%20to%20pandas%20internally%20because%20pingouin%20and%20seaborn%20require%20pandas.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20with%20columns%20%5Bcv_cycle%2C%20method%5D%20plus%20metric%20columns.%0A%20%20%20%20%20%20%20%20%20%20%20%20metric_ls%3A%20List%20of%20metric%20column%20names%20to%20create%20boxplots%20for.%0A%20%20%20%20%20%20%20%20%20%20%20%20save_path%3A%20If%20provided%2C%20the%20figure%20is%20saved%20to%20this%20path%20before%20returning.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Matplotlib%20Figure.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20pingouin%20and%20seaborn%20both%20require%20pandas%0A%20%20%20%20%20%20%20%20df_pd%20%3D%20df.to_pandas()%0A%0A%20%20%20%20%20%20%20%20sns.set_context('notebook')%0A%20%20%20%20%20%20%20%20sns.set(rc%3D%7B'figure.figsize'%3A%20(4%2C%203)%7D%2C%20font_scale%3D1.5)%0A%20%20%20%20%20%20%20%20sns.set_style('whitegrid')%0A%20%20%20%20%20%20%20%20figure%2C%20axes%20%3D%20plt.subplots(2%2C%202%2C%20sharex%3DFalse%2C%20sharey%3DFalse%2C%20figsize%3D(14%2C%2010))%0A%0A%20%20%20%20%20%20%20%20for%20i%2C%20stat%20in%20enumerate(metric_ls)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20friedman%20%3D%20pg.friedman(df_pd%2C%20dv%3Dstat%2C%20within%3D%22method%22%2C%20subject%3D%22cv_cycle%22)%5B'p_unc'%5D.values%5B0%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20ax%20%3D%20sns.boxplot(y%3Dstat%2C%20x%3D%22method%22%2C%20hue%3D%22method%22%2C%20ax%3Daxes%5Bi%20%2F%2F%202%2C%20i%20%25%202%5D%2C%20data%3Ddf_pd%2C%20palette%3D%22Set2%22%2C%20legend%3DFalse)%0A%20%20%20%20%20%20%20%20%20%20%20%20title%20%3D%20stat.replace(%22_%22%2C%20%22%20%22).upper()%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_title(f%22p%3D%7Bfriedman%3A.1e%7D%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xlabel(%22%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_ylabel(title)%0A%20%20%20%20%20%20%20%20%20%20%20%20x_tick_labels%20%3D%20ax.get_xticklabels()%0A%20%20%20%20%20%20%20%20%20%20%20%20label_text_list%20%3D%20%5Bx.get_text()%20for%20x%20in%20x_tick_labels%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20new_xtick_labels%20%3D%20%5B%22%5Cn%22.join(x.split(%22_%22))%20for%20x%20in%20label_text_list%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xticks(list(range(0%2C%20len(x_tick_labels))))%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xticklabels(new_xtick_labels)%0A%20%20%20%20%20%20%20%20figure.tight_layout()%0A%20%20%20%20%20%20%20%20if%20save_path%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20figure.savefig(save_path%2C%20dpi%3D300%2C%20bbox_inches%3D%22tight%22)%0A%20%20%20%20%20%20%20%20return%20figure%0A%0A%20%20%20%20return%20make_boxplots_nonparametric%2C%20make_boxplots_parametric%0A%0A%0A%40app.cell%0Adef%20_(Optional%2C%20Path%2C%20math%2C%20np%2C%20pl%2C%20plt%2C%20rm_tukey_hsd%2C%20sns%2C%20stats)%3A%0A%20%20%20%20def%20make_normality_diagnostic(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20metric_ls%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20save_path%3A%20Optional%5BPath%5D%20%3D%20None%2C%0A%20%20%20%20)%20-%3E%20plt.Figure%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20a%20normality%20diagnostic%20plot%20grid%20with%20histograms%20and%20QQ%20plots%20for%20the%20given%20metrics.%0A%0A%20%20%20%20%20%20%20%20Residuals%20are%20computed%20by%20subtracting%20each%20group's%20mean%20(per%20method)%20so%20that%0A%20%20%20%20%20%20%20%20the%20normality%20assumption%20of%20the%20repeated-measures%20ANOVA%20can%20be%20assessed.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20with%20columns%20%5Bcv_cycle%2C%20method%2C%20split%5D%20plus%20metric%20columns.%0A%20%20%20%20%20%20%20%20%20%20%20%20metric_ls%3A%20List%20of%20metric%20column%20names%20to%20assess%20for%20normality.%0A%20%20%20%20%20%20%20%20%20%20%20%20save_path%3A%20If%20provided%2C%20the%20figure%20is%20saved%20to%20this%20path%20before%20returning.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Matplotlib%20Figure.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20Subtract%20per-method%20group%20mean%20from%20each%20metric%20(mean-centre%20within%20method)%0A%20%20%20%20%20%20%20%20group_means%20%3D%20df.group_by(%22method%22).agg(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20pl.col(m).mean().alias(f%22_mean_%7Bm%7D%22)%20for%20m%20in%20metric_ls%0A%20%20%20%20%20%20%20%20%5D)%0A%20%20%20%20%20%20%20%20df_norm%20%3D%20df.join(group_means%2C%20on%3D%22method%22%2C%20how%3D%22left%22)%0A%20%20%20%20%20%20%20%20df_norm%20%3D%20df_norm.with_columns(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20(pl.col(m)%20-%20pl.col(f%22_mean_%7Bm%7D%22)).alias(m)%20for%20m%20in%20metric_ls%0A%20%20%20%20%20%20%20%20%5D).drop(%5Bf%22_mean_%7Bm%7D%22%20for%20m%20in%20metric_ls%5D)%0A%0A%20%20%20%20%20%20%20%20%23%20Unpivot%20(melt)%20to%20long%20format%20for%20easy%20per-metric%20iteration%0A%20%20%20%20%20%20%20%20df_long%20%3D%20df_norm.unpivot(%0A%20%20%20%20%20%20%20%20%20%20%20%20on%3Dmetric_ls%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20index%3D%5B%22cv_cycle%22%2C%20%22method%22%2C%20%22split%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20variable_name%3D%22metric%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20value_name%3D%22value%22%2C%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20%23%20Convert%20to%20pandas%20for%20seaborn%20and%20scipy.stats.probplot%0A%20%20%20%20%20%20%20%20df_long_pd%20%3D%20df_long.to_pandas()%0A%0A%20%20%20%20%20%20%20%20sns.set_context('notebook'%2C%20font_scale%3D1.5)%0A%20%20%20%20%20%20%20%20sns.set_style('whitegrid')%0A%0A%20%20%20%20%20%20%20%20metrics%20%3D%20df_long_pd%5B'metric'%5D.unique()%0A%20%20%20%20%20%20%20%20n_metrics%20%3D%20len(metrics)%0A%0A%20%20%20%20%20%20%20%20fig%2C%20axes%20%3D%20plt.subplots(2%2C%20n_metrics%2C%20figsize%3D(20%2C%2010))%0A%0A%20%20%20%20%20%20%20%20for%20i%2C%20metric%20in%20enumerate(metrics)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax%20%3D%20axes%5B0%2C%20i%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20sns.histplot(df_long_pd%5Bdf_long_pd%5B'metric'%5D%20%3D%3D%20metric%5D%5B'value'%5D%2C%20kde%3DTrue%2C%20ax%3Dax)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_title(f'%7Bmetric%7D'%2C%20fontsize%3D16)%0A%0A%20%20%20%20%20%20%20%20for%20i%2C%20metric%20in%20enumerate(metrics)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax%20%3D%20axes%5B1%2C%20i%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20metric_data%20%3D%20df_long_pd%5Bdf_long_pd%5B'metric'%5D%20%3D%3D%20metric%5D%5B'value'%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20stats.probplot(metric_data%2C%20dist%3D%22norm%22%2C%20plot%3Dax)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_title(%22%22)%0A%0A%20%20%20%20%20%20%20%20fig.tight_layout()%0A%20%20%20%20%20%20%20%20if%20save_path%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20fig.savefig(save_path%2C%20dpi%3D300%2C%20bbox_inches%3D%22tight%22)%0A%20%20%20%20%20%20%20%20return%20fig%0A%0A%0A%20%20%20%20def%20mcs_plot(pc%2C%20effect_size%2C%20means%2C%20labels%3DTrue%2C%20cmap%3DNone%2C%20cbar_ax_bbox%3DNone%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20ax%3DNone%2C%20show_diff%3DTrue%2C%20cell_text_size%3D16%2C%20axis_text_size%3D12%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20show_cbar%3DTrue%2C%20reverse_cmap%3DFalse%2C%20vlim%3DNone%2C%20**kwargs)%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20a%20multiple%20comparison%20of%20means%20plot%20using%20a%20heatmap.%0A%0A%20%20%20%20%20%20%20%20Parameters%3A%0A%20%20%20%20%20%20%20%20pc%20(pd.DataFrame)%3A%20DataFrame%20containing%20p-values%20for%20pairwise%20comparisons.%0A%20%20%20%20%20%20%20%20effect_size%20(pd.DataFrame)%3A%20DataFrame%20containing%20effect%20sizes%20for%20pairwise%20comparisons.%0A%20%20%20%20%20%20%20%20means%20(pd.Series)%3A%20Series%20containing%20mean%20values%20for%20each%20group.%0A%20%20%20%20%20%20%20%20labels%20(bool)%3A%20Whether%20to%20show%20labels%20on%20the%20axes.%20Default%20is%20True.%0A%20%20%20%20%20%20%20%20cmap%20(str)%3A%20Colormap%20to%20use%20for%20the%20heatmap.%20Default%20is%20None.%0A%20%20%20%20%20%20%20%20cbar_ax_bbox%20(tuple)%3A%20Bounding%20box%20for%20the%20colorbar%20axis.%20Default%20is%20None.%0A%20%20%20%20%20%20%20%20ax%20(matplotlib.axes.Axes)%3A%20The%20axes%20on%20which%20to%20plot%20the%20heatmap.%20Default%20is%20None.%0A%20%20%20%20%20%20%20%20show_diff%20(bool)%3A%20Whether%20to%20show%20the%20mean%20differences%20in%20the%20plot.%20Default%20is%20True.%0A%20%20%20%20%20%20%20%20cell_text_size%20(int)%3A%20Font%20size%20for%20the%20cell%20text.%20Default%20is%2016.%0A%20%20%20%20%20%20%20%20axis_text_size%20(int)%3A%20Font%20size%20for%20the%20axis%20text.%20Default%20is%2012.%0A%20%20%20%20%20%20%20%20show_cbar%20(bool)%3A%20Whether%20to%20show%20the%20colorbar.%20Default%20is%20True.%0A%20%20%20%20%20%20%20%20reverse_cmap%20(bool)%3A%20Whether%20to%20reverse%20the%20colormap.%20Default%20is%20False.%0A%20%20%20%20%20%20%20%20vlim%20(float)%3A%20Limit%20for%20the%20colormap.%20Default%20is%20None.%0A%20%20%20%20%20%20%20%20**kwargs%3A%20Additional%20keyword%20arguments%20for%20the%20heatmap.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20matplotlib.axes.Axes%3A%20The%20axes%20with%20the%20heatmap.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20for%20key%20in%20%5B'cbar'%2C%20'vmin'%2C%20'vmax'%2C%20'center'%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20key%20in%20kwargs%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20del%20kwargs%5Bkey%5D%0A%0A%20%20%20%20%20%20%20%20if%20not%20cmap%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20cmap%20%3D%20%22coolwarm%22%0A%20%20%20%20%20%20%20%20if%20reverse_cmap%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20cmap%20%3D%20cmap%20%2B%20%22_r%22%0A%0A%20%20%20%20%20%20%20%20significance%20%3D%20pc.copy().astype(object)%0A%20%20%20%20%20%20%20%20significance%5B(pc%20%3C%200.001)%20%26%20(pc%20%3E%3D%200)%5D%20%3D%20'***'%0A%20%20%20%20%20%20%20%20significance%5B(pc%20%3C%200.01)%20%26%20(pc%20%3E%3D%200.001)%5D%20%3D%20'**'%0A%20%20%20%20%20%20%20%20significance%5B(pc%20%3C%200.05)%20%26%20(pc%20%3E%3D%200.01)%5D%20%3D%20'*'%0A%20%20%20%20%20%20%20%20significance%5B(pc%20%3E%3D%200.05)%5D%20%3D%20''%0A%0A%20%20%20%20%20%20%20%20np.fill_diagonal(significance.values%2C%20'')%0A%0A%20%20%20%20%20%20%20%20%23%20Create%20a%20DataFrame%20for%20the%20annotations%0A%20%20%20%20%20%20%20%20if%20show_diff%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20annotations%20%3D%20effect_size.round(3).astype(str)%20%2B%20significance%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20annotations%20%3D%20significance%0A%0A%20%20%20%20%20%20%20%20hax%20%3D%20sns.heatmap(effect_size%2C%20cmap%3Dcmap%2C%20annot%3Dannotations%2C%20fmt%3D''%2C%20cbar%3Dshow_cbar%2C%20ax%3Dax%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20annot_kws%3D%7B%22size%22%3A%20cell_text_size%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20vmin%3D-2*vlim%20if%20vlim%20else%20None%2C%20vmax%3D2*vlim%20if%20vlim%20else%20None%2C%20**kwargs)%0A%0A%20%20%20%20%20%20%20%20if%20labels%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20label_list%20%3D%20list(means.index)%0A%20%20%20%20%20%20%20%20%20%20%20%20x_label_list%20%3D%20%5Bx%20%2B%20f'%5Cn%7Bmeans.loc%5Bx%5D.round(2)%7D'%20for%20x%20in%20label_list%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20y_label_list%20%3D%20%5Bx%20%2B%20f'%5Cn%7Bmeans.loc%5Bx%5D.round(2)%7D%5Cn'%20for%20x%20in%20label_list%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20hax.set_xticklabels(x_label_list%2C%20size%3Daxis_text_size%2C%20ha%3D'center'%2C%20va%3D'top'%2C%20rotation%3D0%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20rotation_mode%3D'anchor')%0A%20%20%20%20%20%20%20%20%20%20%20%20hax.set_yticklabels(y_label_list%2C%20size%3Daxis_text_size%2C%20ha%3D'center'%2C%20va%3D'center'%2C%20rotation%3D90%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20rotation_mode%3D'anchor')%0A%0A%20%20%20%20%20%20%20%20hax.set_xlabel('')%0A%20%20%20%20%20%20%20%20hax.set_ylabel('')%0A%0A%20%20%20%20%20%20%20%20return%20hax%0A%0A%0A%20%20%20%20def%20make_mcs_plot_grid(df%2C%20stats%2C%20group_col%2C%20alpha%3D.05%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20figsize%3D(20%2C%2010)%2C%20direction_dict%3D%7B%7D%2C%20effect_dict%3D%7B%7D%2C%20show_diff%3DTrue%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20cell_text_size%3D16%2C%20axis_text_size%3D12%2C%20title_text_size%3D16%2C%20sort_axes%3DFalse%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20save_path%3A%20Optional%5BPath%5D%20%3D%20None)%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20a%20grid%20of%20multiple%20comparison%20of%20means%20plots%20using%20Tukey%20HSD%20test%20results.%0A%0A%20%20%20%20%20%20%20%20Parameters%3A%0A%20%20%20%20%20%20%20%20df%20(pd.DataFrame)%3A%20Input%20dataframe%20containing%20the%20data.%0A%20%20%20%20%20%20%20%20stats%20(list%20of%20str)%3A%20List%20of%20statistical%20metrics%20to%20create%20plots%20for.%0A%20%20%20%20%20%20%20%20group_col%20(str)%3A%20The%20column%20name%20indicating%20the%20groups.%0A%20%20%20%20%20%20%20%20alpha%20(float)%3A%20Significance%20level%20for%20the%20Tukey%20HSD%20test.%20Default%20is%200.05.%0A%20%20%20%20%20%20%20%20figsize%20(tuple)%3A%20Size%20of%20the%20figure.%20Default%20is%20(20%2C%2010).%0A%20%20%20%20%20%20%20%20direction_dict%20(dict)%3A%20Dictionary%20indicating%20whether%20to%20minimize%20or%20maximize%20each%20metric.%0A%20%20%20%20%20%20%20%20effect_dict%20(dict)%3A%20Dictionary%20with%20effect%20size%20limits%20for%20each%20metric.%0A%20%20%20%20%20%20%20%20show_diff%20(bool)%3A%20Whether%20to%20show%20the%20mean%20differences%20in%20the%20plot.%20Default%20is%20True.%0A%20%20%20%20%20%20%20%20cell_text_size%20(int)%3A%20Font%20size%20for%20the%20cell%20text.%20Default%20is%2016.%0A%20%20%20%20%20%20%20%20axis_text_size%20(int)%3A%20Font%20size%20for%20the%20axis%20text.%20Default%20is%2012.%0A%20%20%20%20%20%20%20%20title_text_size%20(int)%3A%20Font%20size%20for%20the%20title%20text.%20Default%20is%2016.%0A%20%20%20%20%20%20%20%20sort%20(bool)%3A%20Whether%20to%20sort%20the%20axes.%20Default%20is%20False.%0A%20%20%20%20%20%20%20%20save_path%20(Path%20%7C%20None)%3A%20If%20provided%2C%20the%20figure%20is%20saved%20to%20this%20path%20before%20returning.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20plt.Figure%3A%20The%20figure%20with%20the%20grid%20of%20heatmaps.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%23%20Use%20a%202-column%20grid%20when%204%20stats%20are%20given%20(perfect%202%C3%972)%3B%20otherwise%203%20columns.%0A%20%20%20%20%20%20%20%20ncol%20%3D%202%20if%20len(stats)%20%3D%3D%204%20else%203%0A%20%20%20%20%20%20%20%20nrow%20%3D%20math.ceil(len(stats)%20%2F%20ncol)%0A%20%20%20%20%20%20%20%20fig%2C%20ax%20%3D%20plt.subplots(nrow%2C%20ncol%2C%20figsize%3Dfigsize)%0A%0A%20%20%20%20%20%20%20%20%23%20Set%20defaults%0A%20%20%20%20%20%20%20%20for%20key%20in%20%5B'r2'%2C%20'rho'%2C%20'prec'%2C%20'recall'%2C%20'mae'%2C%20'mse'%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20direction_dict.setdefault(key%2C%20'maximize'%20if%20key%20in%20%5B'r2'%2C%20'rho'%2C%20'prec'%2C%20'recall'%5D%20else%20'minimize')%0A%0A%20%20%20%20%20%20%20%20for%20key%20in%20%5B'r2'%2C%20'rho'%2C%20'prec'%2C%20'recall'%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20effect_dict.setdefault(key%2C%200.1)%0A%0A%20%20%20%20%20%20%20%20direction_dict%20%3D%20%7Bk.lower()%3A%20v%20for%20k%2C%20v%20in%20direction_dict.items()%7D%0A%20%20%20%20%20%20%20%20effect_dict%20%3D%20%7Bk.lower()%3A%20v%20for%20k%2C%20v%20in%20effect_dict.items()%7D%0A%0A%20%20%20%20%20%20%20%20for%20i%2C%20stat%20in%20enumerate(stats)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20stat%20%3D%20stat.lower()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20row%20%3D%20i%20%2F%2F%20ncol%0A%20%20%20%20%20%20%20%20%20%20%20%20col%20%3D%20i%20%25%20ncol%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20stat%20not%20in%20direction_dict%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(f%22Stat%20'%7Bstat%7D'%20is%20missing%20in%20direction_dict.%20Please%20set%20its%20value.%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20stat%20not%20in%20effect_dict%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20raise%20ValueError(f%22Stat%20'%7Bstat%7D'%20is%20missing%20in%20effect_dict.%20Please%20set%20its%20value.%22)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20reverse_cmap%20%3D%20False%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20direction_dict%5Bstat%5D%20%3D%3D%20'minimize'%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20reverse_cmap%20%3D%20True%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20_%2C%20df_means%2C%20df_means_diff%2C%20pc%20%3D%20rm_tukey_hsd(df%2C%20stat%2C%20group_col%2C%20alpha%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20sort_axes%2C%20direction_dict)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20hax%20%3D%20mcs_plot(pc%2C%20effect_size%3Ddf_means_diff%2C%20means%3Ddf_means%5Bstat%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20show_diff%3Dshow_diff%2C%20ax%3Dax%5Brow%2C%20col%5D%2C%20cbar%3DTrue%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20cell_text_size%3Dcell_text_size%2C%20axis_text_size%3Daxis_text_size%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20reverse_cmap%3Dreverse_cmap%2C%20vlim%3Deffect_dict%5Bstat%5D)%0A%20%20%20%20%20%20%20%20%20%20%20%20hax.set_title(stat.upper()%2C%20fontsize%3Dtitle_text_size)%0A%0A%20%20%20%20%20%20%20%20%23%20If%20there%20are%20less%20plots%20than%20cells%20in%20the%20grid%2C%20hide%20the%20remaining%20cells%0A%20%20%20%20%20%20%20%20if%20(len(stats)%20%25%20ncol)%20!%3D%200%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20i%20in%20range(len(stats)%2C%20nrow%20*%20ncol)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20row%20%3D%20i%20%2F%2F%20ncol%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20col%20%3D%20i%20%25%20ncol%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20ax%5Brow%2C%20col%5D.set_visible(False)%0A%0A%20%20%20%20%20%20%20%20fig.tight_layout()%0A%20%20%20%20%20%20%20%20if%20save_path%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20fig.savefig(save_path%2C%20dpi%3D300%2C%20bbox_inches%3D%22tight%22)%0A%20%20%20%20%20%20%20%20return%20fig%0A%0A%0A%20%20%20%20return%20make_mcs_plot_grid%2C%20make_normality_diagnostic%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20Optional%2C%0A%20%20%20%20Path%2C%0A%20%20%20%20calc_regression_metrics%2C%0A%20%20%20%20np%2C%0A%20%20%20%20pl%2C%0A%20%20%20%20plt%2C%0A%20%20%20%20precision_score%2C%0A%20%20%20%20recall_score%2C%0A%20%20%20%20rm_tukey_hsd%2C%0A%20%20%20%20sns%2C%0A)%3A%0A%20%20%20%20def%20make_scatterplot(%0A%20%20%20%20%20%20%20%20df%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20val_col%3A%20str%2C%0A%20%20%20%20%20%20%20%20pred_col%3A%20str%2C%0A%20%20%20%20%20%20%20%20thresh%3A%20float%2C%0A%20%20%20%20%20%20%20%20cycle_col%3A%20str%20%3D%20%22cv_cycle%22%2C%0A%20%20%20%20%20%20%20%20group_col%3A%20str%20%3D%20%22method%22%2C%0A%20%20%20%20%20%20%20%20save_path%3A%20Optional%5BPath%5D%20%3D%20None%2C%0A%20%20%20%20)%20-%3E%20plt.Figure%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20scatter%20plots%20for%20each%20method%20showing%20the%20relationship%20between%20predicted%20and%20measured%20values.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df%3A%20Polars%20DataFrame%20with%20columns%20%5Bgroup_col%2C%20cycle_col%2C%20val_col%2C%20pred_col%5D.%0A%20%20%20%20%20%20%20%20%20%20%20%20val_col%3A%20Column%20name%20for%20the%20ground%20truth%20values.%0A%20%20%20%20%20%20%20%20%20%20%20%20pred_col%3A%20Column%20name%20for%20the%20model%20predictions.%0A%20%20%20%20%20%20%20%20%20%20%20%20thresh%3A%20Decision%20threshold%20for%20binary%20precision%2Frecall%20computation.%0A%20%20%20%20%20%20%20%20%20%20%20%20cycle_col%3A%20Column%20indicating%20the%20cross-validation%20fold.%20Default%20is%20%22cv_cycle%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20group_col%3A%20Column%20indicating%20the%20comparison%20groups%2Fmethods.%20Default%20is%20%22method%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20save_path%3A%20If%20provided%2C%20the%20figure%20is%20saved%20to%20this%20path%20before%20returning.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Matplotlib%20Figure.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20df_split_metrics%20%3D%20calc_regression_metrics(%0A%20%20%20%20%20%20%20%20%20%20%20%20df%2C%20cycle_col%3Dcycle_col%2C%20val_col%3Dval_col%2C%20pred_col%3Dpred_col%2C%20thresh%3Dthresh%0A%20%20%20%20%20%20%20%20)%0A%20%20%20%20%20%20%20%20methods%20%3D%20df%5Bgroup_col%5D.unique().to_list()%0A%0A%20%20%20%20%20%20%20%20fig%2C%20axs%20%3D%20plt.subplots(nrows%3D3%2C%20ncols%3D2%2C%20figsize%3D(14%2C%2018))%0A%20%20%20%20%20%20%20%20axs_flat%20%3D%20axs.flatten()%0A%0A%20%20%20%20%20%20%20%20for%20ax%2C%20method%20in%20zip(axs_flat%2C%20methods)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Filter%20using%20Polars%20expressions%0A%20%20%20%20%20%20%20%20%20%20%20%20df_method%20%3D%20df.filter(pl.col(group_col)%20%3D%3D%20method)%0A%20%20%20%20%20%20%20%20%20%20%20%20df_metrics%20%3D%20df_split_metrics.filter(pl.col(group_col)%20%3D%3D%20method)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20y_true_vals%20%3D%20df_method%5Bval_col%5D.to_numpy()%0A%20%20%20%20%20%20%20%20%20%20%20%20y_pred_vals%20%3D%20df_method%5Bpred_col%5D.to_numpy()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.scatter(y_pred_vals%2C%20y_true_vals%2C%20alpha%3D0.3)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.plot(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5By_true_vals.min()%2C%20y_true_vals.max()%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5By_true_vals.min()%2C%20y_true_vals.max()%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20'k--'%2C%20lw%3D1%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.axhline(y%3Dthresh%2C%20color%3D'r'%2C%20linestyle%3D'--')%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.axvline(x%3Dthresh%2C%20color%3D'r'%2C%20linestyle%3D'--')%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_title(method)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20precision%20%3D%20precision_score(y_true_vals%20%3E%20thresh%2C%20y_pred_vals%20%3E%20thresh)%0A%20%20%20%20%20%20%20%20%20%20%20%20recall%20%3D%20recall_score(y_true_vals%20%3E%20thresh%2C%20y_pred_vals%20%3E%20thresh)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Aggregate%20mean%20metrics%20across%20CV%20folds%20for%20the%20annotation%0A%20%20%20%20%20%20%20%20%20%20%20%20mae_mean%20%20%3D%20df_metrics%5B%22mae%22%5D.mean()%0A%20%20%20%20%20%20%20%20%20%20%20%20mse_mean%20%20%3D%20df_metrics%5B%22mse%22%5D.mean()%0A%20%20%20%20%20%20%20%20%20%20%20%20r2_mean%20%20%20%3D%20df_metrics%5B%22r2%22%5D.mean()%0A%20%20%20%20%20%20%20%20%20%20%20%20rho_mean%20%20%3D%20df_metrics%5B%22rho%22%5D.mean()%0A%20%20%20%20%20%20%20%20%20%20%20%20metrics_text%20%3D%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22MAE%3A%20%7Bmae_mean%3A.2f%7D%5CnMSE%3A%20%7Bmse_mean%3A.2f%7D%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22R2%3A%20%7Br2_mean%3A.2f%7D%5Cnrho%3A%20%7Brho_mean%3A.2f%7D%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22Precision%3A%20%7Bprecision%3A.2f%7D%5CnRecall%3A%20%7Brecall%3A.2f%7D%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.text(0.05%2C%20.5%2C%20metrics_text%2C%20transform%3Dax.transAxes%2C%20verticalalignment%3D'top')%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_xlabel('Predicted')%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_ylabel('Measured')%0A%0A%20%20%20%20%20%20%20%20for%20ax%20in%20axs_flat%5Blen(methods)%3A%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_visible(False)%0A%0A%20%20%20%20%20%20%20%20fig.tight_layout()%0A%20%20%20%20%20%20%20%20if%20save_path%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20fig.savefig(save_path%2C%20dpi%3D300%2C%20bbox_inches%3D%22tight%22)%0A%20%20%20%20%20%20%20%20return%20fig%0A%0A%0A%20%20%20%20def%20ci_plot(result_tab%2C%20ax_in%2C%20name%3A%20str%2C%20show_ylabel%3A%20bool%20%3D%20True)%20-%3E%20None%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20a%20confidence%20interval%20plot%20for%20the%20given%20result%20table.%0A%0A%20%20%20%20%20%20%20%20result_tab%20is%20a%20pandas%20DataFrame%20produced%20by%20rm_tukey_hsd%20%E2%80%94%20seaborn's%0A%20%20%20%20%20%20%20%20pointplot%20and%20errorbar%20require%20pandas%20Series%20for%20its%20index%20labels.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20result_tab%3A%20pandas%20DataFrame%20with%20columns%20%5B'meandiff'%2C%20'lower'%2C%20'upper'%5D.%0A%20%20%20%20%20%20%20%20%20%20%20%20ax_in%3A%20Matplotlib%20Axes%20on%20which%20to%20draw%20the%20plot.%0A%20%20%20%20%20%20%20%20%20%20%20%20name%3A%20Title%20string%20for%20the%20subplot.%0A%20%20%20%20%20%20%20%20%20%20%20%20show_ylabel%3A%20Whether%20to%20show%20y-axis%20tick%20labels.%20Set%20False%20for%20right-column%20axes.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20result_err%20%3D%20np.array(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20result_tab%5B'meandiff'%5D%20-%20result_tab%5B'lower'%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20result_tab%5B'upper'%5D%20-%20result_tab%5B'meandiff'%5D%2C%0A%20%20%20%20%20%20%20%20%5D)%0A%20%20%20%20%20%20%20%20sns.set(rc%3D%7B'figure.figsize'%3A%20(6%2C%202)%7D)%0A%20%20%20%20%20%20%20%20sns.set_context('notebook')%0A%20%20%20%20%20%20%20%20sns.set_style('whitegrid')%0A%20%20%20%20%20%20%20%20ax%20%3D%20sns.pointplot(x%3Dresult_tab.meandiff%2C%20y%3Dresult_tab.index%2C%20marker%3D'o'%2C%20linestyle%3D''%2C%20ax%3Dax_in)%0A%20%20%20%20%20%20%20%20ax.errorbar(y%3Dresult_tab.index%2C%20x%3Dresult_tab%5B'meandiff'%5D%2C%20xerr%3Dresult_err%2C%20fmt%3D'o'%2C%20capsize%3D5)%0A%20%20%20%20%20%20%20%20ax.axvline(0%2C%20ls%3D%22--%22%2C%20lw%3D3)%0A%20%20%20%20%20%20%20%20ax.set_xlabel(%22Mean%20Difference%22)%0A%20%20%20%20%20%20%20%20ax.set_ylabel(%22%22)%0A%20%20%20%20%20%20%20%20ax.set_title(name)%0A%20%20%20%20%20%20%20%20ax.set_xlim(-0.2%2C%200.2)%0A%20%20%20%20%20%20%20%20if%20not%20show_ylabel%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_yticklabels(%5B%5D)%0A%0A%0A%20%20%20%20def%20make_ci_plot_grid(%0A%20%20%20%20%20%20%20%20df_in%3A%20pl.DataFrame%2C%0A%20%20%20%20%20%20%20%20metric_list%3A%20list%5Bstr%5D%2C%0A%20%20%20%20%20%20%20%20group_col%3A%20str%20%3D%20%22method%22%2C%0A%20%20%20%20%20%20%20%20save_path%3A%20Optional%5BPath%5D%20%3D%20None%2C%0A%20%20%20%20)%20-%3E%20plt.Figure%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Create%20a%20grid%20of%20confidence%20interval%20plots%20for%20multiple%20metrics%20using%20Tukey%20HSD%20test%20results.%0A%0A%20%20%20%20%20%20%20%20Args%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20df_in%3A%20Polars%20DataFrame%20passed%20through%20to%20rm_tukey_hsd%20(converted%20internally).%0A%20%20%20%20%20%20%20%20%20%20%20%20metric_list%3A%20List%20of%20metric%20column%20names%20to%20create%20confidence%20interval%20plots%20for.%0A%20%20%20%20%20%20%20%20%20%20%20%20group_col%3A%20Column%20indicating%20the%20comparison%20groups.%20Default%20is%20%22method%22.%0A%20%20%20%20%20%20%20%20%20%20%20%20save_path%3A%20If%20provided%2C%20the%20figure%20is%20saved%20to%20this%20path%20before%20returning.%0A%0A%20%20%20%20%20%20%20%20Returns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20Matplotlib%20Figure.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20figure%2C%20axes%20%3D%20plt.subplots(2%2C%202%2C%20figsize%3D(14%2C%2012)%2C%20sharex%3DFalse)%0A%20%20%20%20%20%20%20%20for%20i%2C%20metric%20in%20enumerate(metric_list)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20row%2C%20col%20%3D%20i%20%2F%2F%202%2C%20i%20%25%202%0A%20%20%20%20%20%20%20%20%20%20%20%20df_tukey%2C%20_%2C%20_%2C%20_%20%3D%20rm_tukey_hsd(df_in%2C%20metric%2C%20group_col%3Dgroup_col)%0A%20%20%20%20%20%20%20%20%20%20%20%20ci_plot(df_tukey%2C%20ax_in%3Daxes%5Brow%2C%20col%5D%2C%20name%3Dmetric%2C%20show_ylabel%3D(col%20%3D%3D%200))%0A%20%20%20%20%20%20%20%20for%20ax%20in%20axes.flatten()%5Blen(metric_list)%3A%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20ax.set_visible(False)%0A%20%20%20%20%20%20%20%20figure.suptitle(%22Multiple%20Comparison%20of%20Means%5CnTukey%20HSD%2C%20FWER%3D0.05%22)%0A%20%20%20%20%20%20%20%20figure.tight_layout()%0A%20%20%20%20%20%20%20%20if%20save_path%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20figure.savefig(save_path%2C%20dpi%3D300%2C%20bbox_inches%3D%22tight%22)%0A%20%20%20%20%20%20%20%20return%20figure%0A%0A%20%20%20%20return%20make_ci_plot_grid%2C%20make_scatterplot%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Read%20train%20dataset%20and%20test%20different%20data%20splits%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(pl)%3A%0A%20%20%20%20all_compounds%20%3D%20pl.read_csv(%22..%2Fdata%2Fprocessed%2Fall_compounds_activity_data.csv%22)%0A%20%20%20%20all_compounds%0A%20%20%20%20return%20(all_compounds%2C)%0A%0A%0A%40app.cell%0Adef%20_(all_compounds%2C%20pl)%3A%0A%20%20%20%20whole_train%20%3D%20all_compounds.filter(pl.col(%22pEC50_dr%22).is_not_null())%0A%20%20%20%20whole_train%0A%20%20%20%20return%20(whole_train%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Comparing%20data%20split%20strategies%20via%20train%2Ftest%20Tanimoto%20similarity%0A%0A%20%20%20%20For%20each%20split%20method%20we%20run%20one%20outer%20round%20of%205-fold%20CV%20and%20compute%2C%20for%20every%0A%20%20%20%20test%20compound%2C%20its%20Tanimoto%20similarity%20to%20all%20training%20compounds%20(ECFP4%2C%20radius%202%2C%0A%20%20%20%202048%20bits).%20%20Two%20views%20are%20compared%3A%0A%0A%20%20%20%20-%20**All%20pairs**%20%E2%80%94%20every%20(test%2C%20train)%20similarity%20value%2C%20giving%20a%20sense%20of%20the%20full%0A%20%20%20%20%20%20similarity%20distribution%20the%20model%20is%20exposed%20to%0A%20%20%20%20-%20**Nearest%20neighbour**%20%E2%80%94%20only%20the%20maximum%20similarity%20per%20test%20compound%2C%20which%0A%20%20%20%20%20%20directly%20measures%20how%20%22close%22%20to%20training%20data%20each%20prediction%20will%20be%0A%0A%20%20%20%20A%20well-separated%20scaffold%20or%20temporal%20split%20should%20shift%20both%20distributions%20leftward%0A%20%20%20%20relative%20to%20the%20random%20split.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20DataStructs%2C%0A%20%20%20%20ExplicitBitVect%2C%0A%20%20%20%20generate_cv_splits_random%2C%0A%20%20%20%20generate_cv_splits_scaffold%2C%0A%20%20%20%20generate_cv_splits_temporal%2C%0A%20%20%20%20generate_fingerprint%2C%0A%20%20%20%20np%2C%0A%20%20%20%20pl%2C%0A%20%20%20%20whole_train%2C%0A)%3A%0A%20%20%20%20def%20_to_rdkit_bitvects(df%3A%20pl.DataFrame)%20-%3E%20list%5BExplicitBitVect%5D%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Convert%20the%20%22ecfp%22%20uint8%20numpy%20array%20column%20(added%20by%20generate_fingerprint)%0A%20%20%20%20%20%20%20%20to%20a%20list%20of%20RDKit%20ExplicitBitVect%20objects%20required%20by%20BulkTanimotoSimilarity.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20fp_size%20%3D%20len(df%5B%22ecfp%22%5D%5B0%5D)%0A%20%20%20%20%20%20%20%20bitvects%20%3D%20%5B%5D%0A%20%20%20%20%20%20%20%20for%20arr%20in%20df%5B%22ecfp%22%5D.to_list()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20bv%20%3D%20ExplicitBitVect(fp_size)%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20i%20in%20np.flatnonzero(arr)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20bv.SetBit(int(i))%0A%20%20%20%20%20%20%20%20%20%20%20%20bitvects.append(bv)%0A%20%20%20%20%20%20%20%20return%20bitvects%0A%0A%20%20%20%20def%20_fold_similarities(train_df%3A%20pl.DataFrame%2C%20test_df%3A%20pl.DataFrame)%20-%3E%20dict%5Bstr%2C%20list%5Bfloat%5D%5D%3A%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20Compute%20all-pairs%20and%20nearest-neighbour%20Tanimoto%20similarities%20between%0A%20%20%20%20%20%20%20%20test%20and%20train%20fingerprints%20for%20a%20single%20fold.%0A%0A%20%20%20%20%20%20%20%20Fingerprints%20are%20ECFP4%20(radius%3D2%2C%20fp_size%3D2048)%20generated%20via%0A%20%20%20%20%20%20%20%20generate_fingerprint%20and%20converted%20to%20RDKit%20ExplicitBitVect%20for%0A%20%20%20%20%20%20%20%20BulkTanimotoSimilarity.%0A%0A%20%20%20%20%20%20%20%20Returns%20a%20dict%20with%20keys%20%22all%22%20and%20%22nn%22%2C%20each%20a%20flat%20list%20of%20floats.%0A%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20train_fps%20%3D%20_to_rdkit_bitvects(generate_fingerprint(train_df%2C%20%22ecfp%22%2C%20radius%3D2%2C%20fp_size%3D2048))%0A%20%20%20%20%20%20%20%20test_fps%20%20%3D%20_to_rdkit_bitvects(generate_fingerprint(test_df%2C%20%20%22ecfp%22%2C%20radius%3D2%2C%20fp_size%3D2048))%0A%0A%20%20%20%20%20%20%20%20all_sims%3A%20list%5Bfloat%5D%20%3D%20%5B%5D%0A%20%20%20%20%20%20%20%20nn_sims%3A%20%20list%5Bfloat%5D%20%3D%20%5B%5D%0A%0A%20%20%20%20%20%20%20%20for%20test_fp%20in%20test_fps%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20sims%20%3D%20DataStructs.BulkTanimotoSimilarity(test_fp%2C%20train_fps)%0A%20%20%20%20%20%20%20%20%20%20%20%20all_sims.extend(sims)%0A%20%20%20%20%20%20%20%20%20%20%20%20nn_sims.append(float(np.max(sims)))%0A%0A%20%20%20%20%20%20%20%20return%20%7B%22all%22%3A%20all_sims%2C%20%22nn%22%3A%20nn_sims%7D%0A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20collect%20similarities%20across%20all%20three%20split%20methods%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%0A%20%20%20%20records%3A%20list%5Bdict%5D%20%3D%20%5B%5D%0A%0A%20%20%20%20%23%20Random%201%C3%975%20CV%20%20(n_outer%3D1%2C%20n_inner%3D5)%0A%20%20%20%20for%20_fold%2C%20_outer%2C%20_inner%2C%20_train%2C%20_val%2C%20_test%20in%20generate_cv_splits_random(%0A%20%20%20%20%20%20%20%20whole_train%2C%20n_outer%3D1%2C%20n_inner%3D5%2C%20seed%3D42%0A%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20_sims%20%3D%20_fold_similarities(_train%2C%20_test)%0A%20%20%20%20%20%20%20%20for%20_s%20in%20_sims%5B%22all%22%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20records.append(%7B%22split%22%3A%20%22random%22%2C%20%22mode%22%3A%20%22all%20pairs%22%2C%20%20%20%20%20%20%20%20%22tanimoto%22%3A%20_s%7D)%0A%20%20%20%20%20%20%20%20for%20_s%20in%20_sims%5B%22nn%22%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20records.append(%7B%22split%22%3A%20%22random%22%2C%20%22mode%22%3A%20%22nearest%20neighbour%22%2C%22tanimoto%22%3A%20_s%7D)%0A%0A%20%20%20%20%23%20Scaffold%201%C3%975%20CV%0A%20%20%20%20for%20_fold%2C%20_outer%2C%20_inner%2C%20_train%2C%20_val%2C%20_test%20in%20generate_cv_splits_scaffold(%0A%20%20%20%20%20%20%20%20whole_train%2C%20n_outer%3D1%2C%20n_inner%3D5%2C%20seed%3D42%0A%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20_sims%20%3D%20_fold_similarities(_train%2C%20_test)%0A%20%20%20%20%20%20%20%20for%20_s%20in%20_sims%5B%22all%22%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20records.append(%7B%22split%22%3A%20%22scaffold%22%2C%20%22mode%22%3A%20%22all%20pairs%22%2C%20%20%20%20%20%20%20%20%22tanimoto%22%3A%20_s%7D)%0A%20%20%20%20%20%20%20%20for%20_s%20in%20_sims%5B%22nn%22%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20records.append(%7B%22split%22%3A%20%22scaffold%22%2C%20%22mode%22%3A%20%22nearest%20neighbour%22%2C%22tanimoto%22%3A%20_s%7D)%0A%0A%20%20%20%20%23%20Temporal%205-fold%20CV%0A%20%20%20%20for%20_fold%2C%20_train%2C%20_val%2C%20_test%20in%20generate_cv_splits_temporal(%0A%20%20%20%20%20%20%20%20whole_train%2C%20n_folds%3D5%2C%20seed%3D42%0A%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20_sims%20%3D%20_fold_similarities(_train%2C%20_test)%0A%20%20%20%20%20%20%20%20for%20_s%20in%20_sims%5B%22all%22%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20records.append(%7B%22split%22%3A%20%22temporal%22%2C%20%22mode%22%3A%20%22all%20pairs%22%2C%20%20%20%20%20%20%20%20%22tanimoto%22%3A%20_s%7D)%0A%20%20%20%20%20%20%20%20for%20_s%20in%20_sims%5B%22nn%22%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20records.append(%7B%22split%22%3A%20%22temporal%22%2C%20%22mode%22%3A%20%22nearest%20neighbour%22%2C%22tanimoto%22%3A%20_s%7D)%0A%0A%20%20%20%20sim_df%20%3D%20pl.DataFrame(records)%0A%20%20%20%20return%20(sim_df%2C)%0A%0A%0A%40app.cell%0Adef%20_(Path%2C%20mo%2C%20mpatches%2C%20plt%2C%20sim_df)%3A%0A%20%20%20%20_splits%20%20%3D%20%5B%22random%22%2C%20%22scaffold%22%2C%20%22temporal%22%5D%0A%20%20%20%20_modes%20%20%20%3D%20%5B%22all%20pairs%22%2C%20%22nearest%20neighbour%22%5D%0A%20%20%20%20_colors%20%20%3D%20%7B%22random%22%3A%20%22%234C78A8%22%2C%20%22scaffold%22%3A%20%22%23F58518%22%2C%20%22temporal%22%3A%20%22%2354A24B%22%7D%0A%20%20%20%20_labels%20%20%3D%20%7B%22all%20pairs%22%3A%20%22All%20pairs%22%2C%20%22nearest%20neighbour%22%3A%20%22Nearest%20neighbour%22%7D%0A%0A%20%20%20%20_fig%2C%20_axes%20%3D%20plt.subplots(1%2C%202%2C%20figsize%3D(10%2C%205)%2C%20sharey%3DTrue)%0A%20%20%20%20_fig.suptitle(%22Train%2Ftest%20Tanimoto%20similarity%20by%20split%20strategy%20(ECFP4)%22%2C%20fontsize%3D13)%0A%0A%20%20%20%20for%20_ax%2C%20_mode%20in%20zip(_axes%2C%20_modes)%3A%0A%20%20%20%20%20%20%20%20%23%20Build%20one%20data%20list%20and%20one%20colour%20list%20per%20split%2C%20in%20fixed%20order%0A%20%20%20%20%20%20%20%20_data%20%20%20%3D%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20sim_df.filter((sim_df%5B%22split%22%5D%20%3D%3D%20_s)%20%26%20(sim_df%5B%22mode%22%5D%20%3D%3D%20_mode))%5B%22tanimoto%22%5D.to_list()%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20_s%20in%20_splits%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20%20%20%20%20_bp%20%3D%20_ax.boxplot(%0A%20%20%20%20%20%20%20%20%20%20%20%20_data%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20patch_artist%3DTrue%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20showfliers%3DFalse%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20medianprops%3D%7B%22color%22%3A%20%22black%22%2C%20%22linewidth%22%3A%201.5%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20whiskerprops%3D%7B%22linewidth%22%3A%201.2%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20capprops%3D%7B%22linewidth%22%3A%201.2%7D%2C%0A%20%20%20%20%20%20%20%20)%0A%20%20%20%20%20%20%20%20for%20_patch%2C%20_split%20in%20zip(_bp%5B%22boxes%22%5D%2C%20_splits)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20_patch.set_facecolor(_colors%5B_split%5D)%0A%20%20%20%20%20%20%20%20%20%20%20%20_patch.set_alpha(0.8)%0A%0A%20%20%20%20%20%20%20%20_ax.set_title(_labels%5B_mode%5D%2C%20fontsize%3D12)%0A%20%20%20%20%20%20%20%20_ax.set_xticks(%5B1%2C%202%2C%203%5D)%0A%20%20%20%20%20%20%20%20_ax.set_xticklabels(_splits%2C%20fontsize%3D11)%0A%20%20%20%20%20%20%20%20_ax.set_ylim(0%2C%201)%0A%20%20%20%20%20%20%20%20_ax.set_ylabel(%22Tanimoto%20similarity%22%2C%20fontsize%3D11)%0A%20%20%20%20%20%20%20%20_ax.set_xlabel(%22Split%20strategy%22%2C%20fontsize%3D11)%0A%20%20%20%20%20%20%20%20_ax.yaxis.grid(True%2C%20linestyle%3D%22--%22%2C%20alpha%3D0.6)%0A%20%20%20%20%20%20%20%20_ax.set_axisbelow(True)%0A%0A%20%20%20%20%23%20shared%20legend%0A%20%20%20%20_handles%20%3D%20%5Bmpatches.Patch(facecolor%3D_colors%5Bs%5D%2C%20alpha%3D0.8%2C%20label%3Ds)%20for%20s%20in%20_splits%5D%0A%20%20%20%20_fig.legend(handles%3D_handles%2C%20loc%3D%22lower%20center%22%2C%20ncol%3D3%2C%20fontsize%3D11%2C%20frameon%3DFalse%2C%20bbox_to_anchor%3D(0.5%2C%20-0.04))%0A%20%20%20%20_fig.tight_layout()%0A%0A%20%20%20%20_PLOT_DIR%20%3D%20Path(%22..%2Fplots%2F2_ml_baseline%22)%0A%20%20%20%20_PLOT_DIR.mkdir(parents%3DTrue%2C%20exist_ok%3DTrue)%0A%20%20%20%20_fig.savefig(_PLOT_DIR%20%2F%20%22tanimoto_similarity_splits.png%22%2C%20bbox_inches%3D%22tight%22)%0A%0A%20%20%20%20mo.as_html(_fig)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20All%20three%20splits%20are%20very%20similar.%0A%20%20%20%20For%20the%20scaffold%20split%2C%20this%20is%20likely%20because%20the%20majority%20of%20scaffolds%20are%20present%0A%20%20%20%20in%20only%20one%20or%20a%20very%20small%20number%20of%20molecules.%0A%20%20%20%20For%20a%20more%20challenging%20structure-based%20data%20split%2C%20you%20could%20perform%20clustering%20such%20as%0A%20%20%20%20hierarchical%20clustering%20and%20then%20extract%20five%20distinct%20sets%20of%20molecules.%0A%0A%20%20%20%20The%20temporal%20split%20is%20traditionally%20considered%20a%20more%20realistic%20and%20pessimistic%20split%2C%0A%20%20%20%20but%20that%20assumes%20data%20from%20a%20cyclical%20process%20%E2%80%94%20a%20series%20of%20DMTL%20(Design%2C%20Make%2C%20Test%2C%20Learn)%20cycles%20%E2%80%94%20where%20later%20entries%0A%20%20%20%20expand%20and%20learn%20from%20previous%20ones.%0A%20%20%20%20That%20is%20not%20the%20case%20here%2C%20as%20the%20training%20set%20is%20largely%20a%20subset%20of%20a%20screening%20library.%0A%20%20%20%20Nonetheless%2C%20there%20is%20a%20small%20shift%20to%20lower%20values%2C%20seen%20especially%20in%20the%20NN%20plot.%0A%20%20%20%20If%20I%20wanted%20to%20examine%20this%20in%20more%20detail%2C%20I%20would%20check%20whether%20the%20training%20compounds%0A%20%20%20%20that%20were%20not%20in%20the%20single-dose%20set%20have%20ID%20numbers%20after%20those%20of%20the%20single-dose%20compounds.%0A%20%20%20%20That%20might%20explain%20the%20slight%20shift.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(gc%2C%20sim_df)%3A%0A%20%20%20%20del%20sim_df%0A%20%20%20%20gc.collect()%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Train%20single%20task%20baseline%20models%0A%0A%20%20%20%20%23%23%205%20%C3%97%205%20random%20cross-validation%0A%0A%20%20%20%20We%20iterate%20over%2025%20folds%20(5%20outer%20%C3%97%205%20inner)%20produced%20by%20%60generate_cv_splits_random%60.%0A%20%20%20%20For%20**each%20fold**%20we%3A%0A%0A%20%20%20%201.%20Generate%20ECFP%20fingerprints%20on%20the%20fold's%20train%20and%20test%20subsets%20(needed%20by%20the%0A%20%20%20%20%20%20%20fingerprint-based%20models).%0A%20%20%20%202.%20Instantiate%20and%20train%20all%20four%20model%20classes%20and%20the%20two%20baselines%20(Mean%20and%20NN)%3A%0A%20%20%20%20%20%20%20-%20%60RandomForestModel%60%20(ECFP%2C%20500%20trees)%0A%20%20%20%20%20%20%20-%20%60BoostedTreesModel%60%20(ECFP%2C%20XGBoost)%0A%20%20%20%20%20%20%20-%20%60ChempropModel%60%20(MPNN%20from%20scratch%2C%2050%20epochs)%0A%20%20%20%20%20%20%20-%20%60ChempropChemeleonModel%60%20(fine-tuned%20CheMeleon%2C%2030%20epochs)%0A%20%20%20%203.%20Predict%20%60pEC50_dr%60%20on%20the%20test%20compounds.%0A%20%20%20%204.%20Collect%20every%20prediction%20into%20a%20long-format%20Polars%20DataFrame%20and%20write%20it%20to%0A%20%20%20%20%20%20%20%60predictions%2Fcv_predictions.csv%60%20once%20all%20folds%20are%20done.%0A%0A%20%20%20%20The%20CSV%20schema%20is%3A%0A%20%20%20%20%60%60%60%0A%20%20%20%20inchikey%20%7C%20molecule_names%20%7C%20smiles%20%7C%20fold%20%7C%20outer_fold%20%7C%20inner_fold%20%7C%0A%20%20%20%20model%20%20%20%20%7C%20y_true%20%20%20%20%20%20%20%20%20%7C%20y_pred%0A%20%20%20%20%60%60%60%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20BoostedTreesModel%2C%0A%20%20%20%20ChempropChemeleonModel%2C%0A%20%20%20%20ChempropModel%2C%0A%20%20%20%20MeanBaseline%2C%0A%20%20%20%20NearestNeighbourBaseline%2C%0A%20%20%20%20Path%2C%0A%20%20%20%20RandomForestModel%2C%0A%20%20%20%20extract_fp_matrix%2C%0A%20%20%20%20gc%2C%0A%20%20%20%20generate_cv_splits_random%2C%0A%20%20%20%20generate_fingerprint%2C%0A%20%20%20%20gzip%2C%0A%20%20%20%20pl%2C%0A%20%20%20%20tqdm%2C%0A%20%20%20%20whole_train%2C%0A)%3A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20constants%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20_TARGET_COL%20%20%20%3D%20%22pEC50_dr%22%0A%20%20%20%20_PRED_PATH_GZ%20%3D%20Path(%22..%2Fpredictions%2F2_ml_baseline_5x5cv_random_predictions.csv.gz%22)%0A%20%20%20%20_N_OUTER%20%20%20%20%3D%205%0A%20%20%20%20_N_INNER%20%20%20%20%3D%205%0A%20%20%20%20_SEED%20%20%20%20%20%20%20%3D%2042%0A%20%20%20%20_P_VAL%20%20%20%20%20%20%3D%200.1%20%20%20%20%20%20%20%20%20%20%23%20fraction%20of%20train%20kept%20as%20validation%20(XGBoost%20%2F%20Chemprop%20early%20stopping)%0A%20%20%20%20_FP_TYPE%20%20%20%20%3D%20%22ecfp%22%0A%20%20%20%20_FP_KWARGS%20%20%3D%20%7B%22radius%22%3A%202%2C%20%22fp_size%22%3A%202048%7D%0A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20model%20names%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20_MODEL_NAMES%20%3D%20%5B%22mean%22%2C%20%22nn1%22%2C%20%22rf%22%2C%20%22gbm%22%2C%20%22chemprop%22%2C%20%22chemeleon%22%5D%0A%0A%20%20%20%20if%20_PRED_PATH_GZ.exists()%3A%0A%20%20%20%20%20%20%20%20print(f%22Predictions%20already%20exist%20at%20%7B_PRED_PATH_GZ%7D%20%E2%80%94%20skipping%20training.%22)%0A%20%20%20%20%20%20%20%20_pred_df%20%3D%20pl.read_csv(_PRED_PATH_GZ)%0A%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%23%20%E2%94%80%E2%94%80%20run%20all%2025%20folds%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20%20%20%20%20%23_debug_df%20%3D%20whole_train.sample(n%3D100%2C%20seed%3D_SEED)%20%20%23%20TODO%3A%20remove%20for%20full%20run%0A%20%20%20%20%20%20%20%20_all_records%3A%20list%5Bdict%5D%20%3D%20%5B%5D%0A%20%20%20%20%20%20%20%20_n_folds%20%3D%20_N_OUTER%20*%20_N_INNER%0A%0A%20%20%20%20%20%20%20%20_pbar%20%3D%20tqdm(%0A%20%20%20%20%20%20%20%20%20%20%20%20generate_cv_splits_random(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20whole_train%2C%20n_outer%3D_N_OUTER%2C%20n_inner%3D_N_INNER%2C%20seed%3D_SEED%2C%20p_val%3D_P_VAL%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20total%3D_n_folds%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20desc%3D%22CV%20folds%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20unit%3D%22fold%22%2C%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20for%20_fold%2C%20_outer%2C%20_inner%2C%20_train_raw%2C%20_val_raw%2C%20_test_raw%20in%20_pbar%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Generate%20fingerprints%20once%20per%20fold%20%E2%80%94%20RF%20and%20GBM%20share%20these%20arrays%0A%20%20%20%20%20%20%20%20%20%20%20%20_train_fp%20%3D%20generate_fingerprint(_train_raw%2C%20_FP_TYPE%2C%20**_FP_KWARGS)%0A%20%20%20%20%20%20%20%20%20%20%20%20_val_fp%20%20%20%3D%20generate_fingerprint(_val_raw%2C%20%20%20_FP_TYPE%2C%20**_FP_KWARGS)%0A%20%20%20%20%20%20%20%20%20%20%20%20_test_fp%20%20%3D%20generate_fingerprint(_test_raw%2C%20%20_FP_TYPE%2C%20**_FP_KWARGS)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Extract%20numpy%20arrays%20used%20by%20fingerprint-based%20models%0A%20%20%20%20%20%20%20%20%20%20%20%20_X_train%20%3D%20extract_fp_matrix(_train_fp%2C%20_FP_TYPE)%0A%20%20%20%20%20%20%20%20%20%20%20%20_X_val%20%20%20%3D%20extract_fp_matrix(_val_fp%2C%20%20%20_FP_TYPE)%0A%20%20%20%20%20%20%20%20%20%20%20%20_X_test%20%20%3D%20extract_fp_matrix(_test_fp%2C%20%20_FP_TYPE)%0A%20%20%20%20%20%20%20%20%20%20%20%20_y_train%20%3D%20_train_raw%5B_TARGET_COL%5D.to_numpy()%0A%20%20%20%20%20%20%20%20%20%20%20%20_y_val%20%20%20%3D%20_val_raw%5B_TARGET_COL%5D.to_numpy()%0A%20%20%20%20%20%20%20%20%20%20%20%20_y_true%20%20%3D%20_test_raw%5B_TARGET_COL%5D.to_numpy()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Extract%20SMILES%20lists%20used%20by%20Chemprop-based%20models%0A%20%20%20%20%20%20%20%20%20%20%20%20_smi_train%20%3D%20_train_raw%5B%22smiles%22%5D.to_list()%0A%20%20%20%20%20%20%20%20%20%20%20%20_smi_val%20%20%20%3D%20_val_raw%5B%22smiles%22%5D.to_list()%0A%20%20%20%20%20%20%20%20%20%20%20%20_smi_test%20%20%3D%20_test_raw%5B%22smiles%22%5D.to_list()%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20%E2%94%80%E2%94%80%20train%20%26%20predict%20each%20model%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20_model_name%20in%20_MODEL_NAMES%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_pbar.set_postfix(%7B%22fold%22%3A%20_fold%2C%20%22o%22%3A%20_outer%2C%20%22i%22%3A%20_inner%2C%20%22model%22%3A%20_model_name%7D%2C%20refresh%3DFalse)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20if%20_model_name%20%3D%3D%20%22mean%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model%20%3D%20MeanBaseline()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model.train(_X_train%2C%20_y_train)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_X_test)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20elif%20_model_name%20%3D%3D%20%22nn1%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model%20%3D%20NearestNeighbourBaseline()%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model.train(_X_train%2C%20_y_train)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_X_test)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20elif%20_model_name%20%3D%3D%20%22rf%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model%20%3D%20RandomForestModel(pred_type%3D%22regression%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model.train(_X_train%2C%20_y_train)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_X_test)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20elif%20_model_name%20%3D%3D%20%22gbm%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model%20%3D%20BoostedTreesModel(pred_type%3D%22regression%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model.train(_X_train%2C%20_y_train%2C%20_X_val%2C%20_y_val)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_X_test)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20elif%20_model_name%20%3D%3D%20%22chemprop%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model%20%3D%20ChempropModel(pred_type%3D%22regression%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model.train(_smi_train%2C%20_y_train%2C%20_smi_val%2C%20_y_val%2C%20target_col%3D_TARGET_COL)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_smi_test)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20elif%20_model_name%20%3D%3D%20%22chemeleon%22%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model%20%3D%20ChempropChemeleonModel(pred_type%3D%22regression%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_model.train(_smi_train%2C%20_y_train%2C%20_smi_val%2C%20_y_val%2C%20target_col%3D_TARGET_COL)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_smi_test)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20Free%20model%20memory%20before%20accumulating%20results%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20del%20_model%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20Accumulate%20one%20row%20per%20test%20compound%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20for%20_ik%2C%20_mn%2C%20_smi%2C%20_yt%2C%20_yp%20in%20zip(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_test_raw%5B%22inchikey%22%5D.to_list()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_test_raw%5B%22molecule_names%22%5D.to_list()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_test_raw%5B%22smiles%22%5D.to_list()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_true.tolist()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_y_pred.tolist()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20_all_records.append(%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22inchikey%22%3A%20%20%20%20%20%20%20_ik%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22molecule_names%22%3A%20_mn%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22smiles%22%3A%20%20%20%20%20%20%20%20%20_smi%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22fold%22%3A%20%20%20%20%20%20%20%20%20%20%20_fold%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22outer_fold%22%3A%20%20%20%20%20_outer%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22inner_fold%22%3A%20%20%20%20%20_inner%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22model%22%3A%20%20%20%20%20%20%20%20%20%20_model_name%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22y_true%22%3A%20%20%20%20%20%20%20%20%20_yt%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22y_pred%22%3A%20%20%20%20%20%20%20%20%20_yp%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%7D)%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Release%20per-fold%20fingerprint%20arrays%20before%20the%20next%20fold%0A%20%20%20%20%20%20%20%20%20%20%20%20del%20_train_fp%2C%20_val_fp%2C%20_test_fp%2C%20_X_train%2C%20_X_val%2C%20_X_test%0A%20%20%20%20%20%20%20%20%20%20%20%20gc.collect()%0A%0A%20%20%20%20%20%20%20%20%23%20%E2%94%80%E2%94%80%20write%20predictions%20(gzip-compressed%20directly)%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20%20%20%20%20_pred_df%20%3D%20pl.DataFrame(_all_records)%0A%20%20%20%20%20%20%20%20_PRED_PATH_GZ.parent.mkdir(parents%3DTrue%2C%20exist_ok%3DTrue)%0A%20%20%20%20%20%20%20%20with%20gzip.open(_PRED_PATH_GZ%2C%20%22wb%22)%20as%20_f%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20_pred_df.write_csv(_f)%0A%20%20%20%20%20%20%20%20print(f%22%5CnSaved%20%7Blen(_pred_df)%3A%2C%7D%20prediction%20rows%20%E2%86%92%20%7B_PRED_PATH_GZ%7D%22)%0A%0A%20%20%20%20_pred_df%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20On%20a%20base%20M4%20Mac%20Mini%2C%20the%20previous%20cell%20took%20around%207h%20to%20run%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Model%20comparison%20analysis%0A%0A%20%20%20%20We%20follow%20the%20approach%20from%0A%20%20%20%20%5Bpolaris-hub%2Fpolaris-method-comparison%5D(https%3A%2F%2Fgithub.com%2Fpolaris-hub%2Fpolaris-method-comparison)%3A%0A%0A%20%20%20%20%3E%20**Never%20compare%20models%20using%20only%20the%20mean%20of%20a%20metric%20over%20folds.**%0A%20%20%20%20%3E%20Distributions%20carry%20the%20information%20needed%20to%20assess%20statistical%20significance.%0A%0A%20%20%20%20The%20workflow%20is%3A%0A%20%20%20%201.%20Load%20predictions%20%E2%86%92%20compute%20per-fold%20regression%20metrics%0A%20%20%20%202.%20Inspect%20normality%20of%20the%20residuals%20(histogram%20%2B%20QQ%20plot)%0A%20%20%20%203.%20If%20normally%20distributed%3A%20**repeated-measures%20ANOVA%20%2B%20Tukey%20HSD**%0A%20%20%20%204.%20If%20not%3A%20**Friedman%20test**%20(non-parametric%20equivalent)%0A%20%20%20%205.%20Visualise%20results%3A%20boxplots%2C%20confidence-interval%20plots%2C%20multiple-comparison%20heatmaps%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(pl)%3A%0A%20%20%20%20%22%22%22Load%20the%20saved%205%C3%975%20CV%20predictions%20and%20rename%20the%20model%20column%20to%20'method'%0A%20%20%20%20so%20it%20matches%20the%20column%20name%20expected%20by%20all%20comparison%20functions.%22%22%22%0A%20%20%20%20pred_df%20%3D%20(%0A%20%20%20%20%20%20%20%20pl.read_csv(%22..%2Fpredictions%2F2_ml_baseline_5x5cv_random_predictions.csv.gz%22)%0A%20%20%20%20%20%20%20%20.rename(%7B%22model%22%3A%20%22method%22%2C%20%22fold%22%3A%20%22cv_cycle%22%7D)%0A%20%20%20%20%20%20%20%20.with_columns(pl.lit(%22random%22).alias(%22split%22))%0A%20%20%20%20)%0A%20%20%20%20pred_df%0A%20%20%20%20return%20(pred_df%2C)%0A%0A%0A%40app.cell%0Adef%20_(calc_regression_metrics%2C%20pred_df)%3A%0A%20%20%20%20%22%22%22Compute%20MAE%2C%20MSE%2C%20R%C2%B2%2C%20Spearman%20%CF%81%20(rho)%2C%20precision%20and%20recall%20for%20each%0A%20%20%20%20(cv_cycle%2C%20method)%20group.%20%22%22%22%0A%20%20%20%20THRESH%20%3D%204.0%0A%0A%20%20%20%20metrics_df%20%3D%20calc_regression_metrics(%0A%20%20%20%20%20%20%20%20pred_df%2C%0A%20%20%20%20%20%20%20%20cycle_col%3D%22cv_cycle%22%2C%0A%20%20%20%20%20%20%20%20val_col%3D%22y_true%22%2C%0A%20%20%20%20%20%20%20%20pred_col%3D%22y_pred%22%2C%0A%20%20%20%20%20%20%20%20thresh%3DTHRESH%2C%0A%20%20%20%20)%0A%20%20%20%20metrics_df%0A%20%20%20%20return%20THRESH%2C%20metrics_df%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Step%201%20%E2%80%94%20Scatter%20plots%3A%20predicted%20vs%20measured%0A%0A%20%20%20%20Each%20panel%20shows%20one%20model's%20predictions%20pooled%20across%20all%2025%20folds.%0A%20%20%20%20The%20diagonal%20dashed%20line%20is%20the%20identity%20(perfect%20prediction)%3B%20red%20dashed%20lines%0A%20%20%20%20mark%20the%20activity%20threshold.%20%20Metric%20values%20shown%20are%20fold-averaged.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(Path%2C%20THRESH%2C%20make_scatterplot%2C%20mo%2C%20pred_df)%3A%0A%20%20%20%20PLOT_DIR%20%3D%20Path(%22..%2Fplots%2F2_ml_baseline%22)%0A%20%20%20%20PLOT_DIR.mkdir(parents%3DTrue%2C%20exist_ok%3DTrue)%0A%20%20%20%20mo.as_html(make_scatterplot(%0A%20%20%20%20%20%20%20%20pred_df%2C%0A%20%20%20%20%20%20%20%20val_col%3D%22y_true%22%2C%0A%20%20%20%20%20%20%20%20pred_col%3D%22y_pred%22%2C%0A%20%20%20%20%20%20%20%20thresh%3DTHRESH%2C%0A%20%20%20%20%20%20%20%20cycle_col%3D%22cv_cycle%22%2C%0A%20%20%20%20%20%20%20%20group_col%3D%22method%22%2C%0A%20%20%20%20%20%20%20%20save_path%3DPLOT_DIR%20%2F%20%22scatterplot.png%22%2C%0A%20%20%20%20))%0A%20%20%20%20return%20(PLOT_DIR%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Step%202%20%E2%80%94%20Normality%20diagnostics%0A%0A%20%20%20%20Repeated-measures%20ANOVA%20assumes%20that%20the%20**residuals%20within%20each%20method%20group**%0A%20%20%20%20are%20approximately%20normally%20distributed.%20%20We%20assess%20this%20visually%20via%3A%0A%0A%20%20%20%20-%20**Histogram%20%2B%20KDE**%20(top%20row)%20%E2%80%94%20residuals%20should%20look%20bell-shaped%0A%20%20%20%20-%20**Q%E2%80%93Q%20plot**%20(bottom%20row)%20%E2%80%94%20points%20should%20fall%20on%20the%20diagonal%0A%0A%20%20%20%20If%20the%20residuals%20look%20skewed%20or%20heavy-tailed%2C%20use%20the%20Friedman%20test%20instead%0A%20%20%20%20(see%20Step%204).%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(PLOT_DIR%2C%20make_normality_diagnostic%2C%20metrics_df%2C%20mo)%3A%0A%20%20%20%20METRIC_LIST%20%3D%20%5B%22mae%22%2C%20%22mse%22%2C%20%22r2%22%2C%20%22rho%22%5D%0A%0A%20%20%20%20mo.as_html(make_normality_diagnostic(%0A%20%20%20%20%20%20%20%20metrics_df%2C%0A%20%20%20%20%20%20%20%20METRIC_LIST%2C%0A%20%20%20%20%20%20%20%20save_path%3DPLOT_DIR%20%2F%20%22normality_diagnostic.png%22%2C%0A%20%20%20%20))%0A%20%20%20%20return%20(METRIC_LIST%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Step%203%20%E2%80%94%20Boxplots%20with%20repeated-measures%20ANOVA%20p-values%0A%0A%20%20%20%20Each%20box%20shows%20the%20cross-fold%20distribution%20of%20a%20metric%20for%20one%20model.%0A%20%20%20%20The%20title%20of%20each%20panel%20reports%20the%20**repeated-measures%20ANOVA%20p-value**%2C%0A%20%20%20%20which%20tests%20whether%20at%20least%20one%20model%20differs%20significantly%20from%20the%20others%0A%20%20%20%20while%20accounting%20for%20the%20shared%20folds%20(the%20repeated-measures%20structure).%0A%0A%20%20%20%20A%20small%20p-value%20(%3C%200.05)%20means%20the%20models%20are%20not%20equivalent%3B%20see%20the%0A%20%20%20%20Tukey%20HSD%20heatmaps%20below%20for%20pairwise%20comparisons.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(METRIC_LIST%2C%20PLOT_DIR%2C%20make_boxplots_parametric%2C%20metrics_df%2C%20mo)%3A%0A%0A%20%20%20%20mo.as_html(make_boxplots_parametric(%0A%20%20%20%20%20%20%20%20metrics_df%2C%0A%20%20%20%20%20%20%20%20METRIC_LIST%2C%0A%20%20%20%20%20%20%20%20save_path%3DPLOT_DIR%20%2F%20%22boxplots_parametric.png%22%2C%0A%20%20%20%20))%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Step%204%20%E2%80%94%20Boxplots%20with%20Friedman%20test%20p-values%20(non-parametric)%0A%0A%20%20%20%20Same%20layout%20as%20Step%203%20but%20using%20the%20**Friedman%20test**%2C%20which%20makes%20no%20normality%0A%20%20%20%20assumption.%20%20Compare%20these%20p-values%20with%20the%20ANOVA%20p-values%20above%3A%20if%20they%20agree%2C%0A%20%20%20%20the%20ANOVA%20result%20is%20trustworthy%3B%20if%20they%20differ%20substantially%2C%20prefer%20the%20Friedman%0A%20%20%20%20result.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(METRIC_LIST%2C%20PLOT_DIR%2C%20make_boxplots_nonparametric%2C%20metrics_df%2C%20mo)%3A%0A%0A%20%20%20%20mo.as_html(make_boxplots_nonparametric(%0A%20%20%20%20%20%20%20%20metrics_df%2C%0A%20%20%20%20%20%20%20%20METRIC_LIST%2C%0A%20%20%20%20%20%20%20%20save_path%3DPLOT_DIR%20%2F%20%22boxplots_nonparametric.png%22%2C%0A%20%20%20%20))%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Step%205%20%E2%80%94%20Tukey%20HSD%20confidence-interval%20plots%0A%0A%20%20%20%20These%20plots%20show%20pairwise%20mean%20differences%20between%20models%20with%2095%20%25%20simultaneous%0A%20%20%20%20confidence%20intervals%20(Tukey%20HSD%2C%20family-wise%20error%20rate%20%3D%200.05).%0A%0A%20%20%20%20-%20Intervals%20that%20**do%20not%20cross%20zero**%20indicate%20a%20statistically%20significant%0A%20%20%20%20%20%20difference%20between%20that%20pair.%0A%20%20%20%20-%20Intervals%20are%20symmetric%20around%20the%20observed%20mean%20difference%3B%20the%20dashed%20vertical%0A%20%20%20%20%20%20line%20is%20the%20null%20(no%20difference).%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(METRIC_LIST%2C%20PLOT_DIR%2C%20make_ci_plot_grid%2C%20metrics_df%2C%20mo)%3A%0A%0A%20%20%20%20mo.as_html(make_ci_plot_grid(%0A%20%20%20%20%20%20%20%20metrics_df%2C%0A%20%20%20%20%20%20%20%20METRIC_LIST%2C%0A%20%20%20%20%20%20%20%20group_col%3D%22method%22%2C%0A%20%20%20%20%20%20%20%20save_path%3DPLOT_DIR%20%2F%20%22ci_plot_grid.png%22%2C%0A%20%20%20%20))%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Step%206%20%E2%80%94%20Multiple-comparison%20heatmaps%20(Tukey%20HSD)%0A%0A%20%20%20%20Each%20cell%20shows%20the%20**mean%20difference**%20between%20the%20row%20model%20and%20the%20column%20model%2C%0A%20%20%20%20annotated%20with%20the%20Tukey-adjusted%20p-value%20significance%20stars%3A%0A%0A%20%20%20%20%7C%20Stars%20%7C%20p-value%20%7C%0A%20%20%20%20%7C---%7C---%7C%0A%20%20%20%20%7C%20%60***%60%20%7C%20%3C%200.001%20%7C%0A%20%20%20%20%7C%20%60**%60%20%20%7C%20%3C%200.01%20%20%7C%0A%20%20%20%20%7C%20%60*%60%20%20%20%7C%20%3C%200.05%20%20%7C%0A%20%20%20%20%7C%20(none)%7C%20%E2%89%A5%200.05%20%20%7C%0A%0A%20%20%20%20The%20colour%20encodes%20the%20direction%20and%20magnitude%20of%20the%20difference.%20%20For%20metrics%20to%0A%20%20%20%20**maximise**%20(R%C2%B2%2C%20%CF%81%2C%20precision%2C%20recall)%20a%20warm%20colour%20(red)%20means%20the%20row%20model%0A%20%20%20%20is%20better%3B%20for%20metrics%20to%20**minimise**%20(MAE%2C%20MSE)%20the%20colourmap%20is%20reversed%20so%0A%20%20%20%20warm%20still%20means%20the%20row%20model%20is%20worse.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(METRIC_LIST%2C%20PLOT_DIR%2C%20make_mcs_plot_grid%2C%20metrics_df%2C%20mo)%3A%0A%0A%20%20%20%20mo.as_html(make_mcs_plot_grid(%0A%20%20%20%20%20%20%20%20metrics_df%2C%0A%20%20%20%20%20%20%20%20stats%3DMETRIC_LIST%2C%0A%20%20%20%20%20%20%20%20group_col%3D%22method%22%2C%0A%20%20%20%20%20%20%20%20figsize%3D(13%2C%2012)%2C%0A%20%20%20%20%20%20%20%20direction_dict%3D%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22r2%22%3A%20%22maximize%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22rho%22%3A%20%22maximize%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22prec%22%3A%20%22maximize%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22recall%22%3A%20%22maximize%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22mae%22%3A%20%22minimize%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22mse%22%3A%20%22minimize%22%2C%0A%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20effect_dict%3D%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22r2%22%3A%200.2%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22rho%22%3A%200.2%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22prec%22%3A%200.2%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22recall%22%3A%200.2%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22mae%22%3A%200.5%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22mse%22%3A%201.0%2C%0A%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20show_diff%3DTrue%2C%0A%20%20%20%20%20%20%20%20sort_axes%3DTrue%2C%0A%20%20%20%20%20%20%20%20save_path%3DPLOT_DIR%20%2F%20%22mcs_plot_grid.png%22%2C%0A%20%20%20%20))%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20last%20plot%20shows%20the%20Tukey%20HSD%20results%20as%20a%20heatmap%3A%20cell%20colour%20and%20numbers%20inside%20cells%20encode%20effect%20size%2C%20and%20stars%20indicate%20significance%20level.%0A%20%20%20%20For%20our%20comparison%2C%20CheMeleon%20is%20the%20best%20across%20all%20metrics%20and%20against%20all%20other%20models.%0A%20%20%20%20Chemprop%20is%20better%20than%20RF%20and%20GBM%20across%20all%20metrics.%0A%20%20%20%20GBM%20beats%20RF%20only%20on%20one%20metric%2C%20rho.%0A%0A%20%20%20%20It%20is%20not%20surprising%20that%20the%20mean%20and%20NN%20baselines%20were%20much%20worse%20than%20the%20other%20models.%0A%20%20%20%20As%20expected%2C%20the%20R%C2%B2%20of%20the%20mean%20model%20was%200%20and%20rho%20was%20NaN.%0A%20%20%20%20The%20worst-case%20MAE%20was%200.91%2C%20meaning%20predictions%20were%20off%20by%20roughly%20one%20order%20of%20magnitude%20on%20average.%0A%20%20%20%20The%20NN%20model%20performed%20slightly%20better%20in%20terms%20of%20MAE%2C%20but%20R%C2%B2%20was%20slightly%20below%20zero.%0A%0A%20%20%20%20Given%20the%20results%2C%20in%20the%20next%20cells%20we%20will%20train%20a%20Chemeleon%20model%20on%20the%20whole%20training%20set%0A%20%20%20%20and%20predict%20the%20test%20set.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Train%20Chemeleon%20on%20whole%20train%20dataset%20and%20submit%20predictions%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(ChempropChemeleonModel%2C%20Path%2C%20np%2C%20pl%2C%20whole_train)%3A%0A%20%20%20%20%22%22%22%0A%20%20%20%20Train%20a%20CheMeleon%20model%20on%20the%20entire%20training%20set%20(all%20compounds%20with%20a%20measured%0A%20%20%20%20pEC50_dr)%20and%20generate%20predictions%20for%20the%20513%20held-out%20test%20compounds.%0A%0A%20%20%20%20The%20model%20uses%20a%2010%20%25%20internal%20validation%20split%20drawn%20from%20the%20training%20data%20for%0A%20%20%20%20early%20stopping%20%E2%80%94%20this%20split%20is%20*not*%20the%20competition%20test%20set.%0A%0A%20%20%20%20The%20output%20CSV%20matches%20the%20format%20required%20by%20validate_activity_submission()%3A%0A%20%20%20%20%20%20%20%20SMILES%20%7C%20Molecule%20Name%20%7C%20pEC50%0A%20%20%20%20%22%22%22%0A%0A%0A%20%20%20%20_TARGET_COL%20%20%20%3D%20%22pEC50_dr%22%0A%20%20%20%20_SEED%20%20%20%20%20%20%20%20%20%3D%2042%0A%20%20%20%20_PRED_OUT%20%20%20%20%20%3D%20Path(%22..%2Fpredictions%2F2_ml_baseline_chemeleon_test_submission.csv%22)%0A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20load%20data%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20%23%20Test%20set%20loaded%20directly%20from%20the%20most%20recent%20raw%20release%20file.%0A%20%20%20%20%23%20dose_response_test.csv%20already%20has%20columns%20named%20%22SMILES%22%20and%20%22Molecule%20Name%22.%0A%20%20%20%20_test_df%20%3D%20pl.read_csv(%22..%2Fdata%2Fraw%2F20260409%2Fdose_response_test.csv%22)%0A%20%20%20%20%23%20whole_train%20is%20already%20filtered%20to%20rows%20with%20measured%20pEC50_dr%0A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20validation%20split%20for%20early%20stopping%20(10%20%25%20of%20train)%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%0A%20%20%20%20_rng%20%20%20%20%20%20%3D%20np.random.default_rng(_SEED)%0A%20%20%20%20_n%20%20%20%20%20%20%20%20%3D%20whole_train.shape%5B0%5D%0A%20%20%20%20_val_idx%20%20%3D%20_rng.choice(_n%2C%20size%3Dint(_n%20*%200.1)%2C%20replace%3DFalse)%0A%20%20%20%20_train_idx%20%3D%20np.setdiff1d(np.arange(_n)%2C%20_val_idx)%0A%0A%20%20%20%20_train_sub%20%3D%20whole_train%5B_train_idx%5D%0A%20%20%20%20_val_sub%20%20%20%3D%20whole_train%5B_val_idx%5D%0A%0A%20%20%20%20_X_train%20%3D%20_train_sub%5B%22smiles%22%5D.to_list()%0A%20%20%20%20_y_train%20%3D%20_train_sub%5B_TARGET_COL%5D.to_numpy()%0A%20%20%20%20_X_val%20%20%20%3D%20_val_sub%5B%22smiles%22%5D.to_list()%0A%20%20%20%20_y_val%20%20%20%3D%20_val_sub%5B_TARGET_COL%5D.to_numpy()%0A%20%20%20%20_X_test%20%20%3D%20_test_df%5B%22SMILES%22%5D.to_list()%0A%0A%20%20%20%20%23%20%E2%94%80%E2%94%80%20train%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20if%20_PRED_OUT.exists()%3A%0A%20%20%20%20%20%20%20%20print(f%22Submission%20file%20already%20exists%20at%20%7B_PRED_OUT%7D%20%E2%80%94%20skipping%20training.%22)%0A%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20_model%20%3D%20ChempropChemeleonModel(pred_type%3D%22regression%22%2C%20epochs%3D50)%0A%20%20%20%20%20%20%20%20_model.train(_X_train%2C%20_y_train%2C%20_X_val%2C%20_y_val%2C%20target_col%3D_TARGET_COL)%0A%0A%20%20%20%20%20%20%20%20%23%20%E2%94%80%E2%94%80%20predict%20and%20build%20submission%20DataFrame%20%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%E2%94%80%0A%20%20%20%20%20%20%20%20_y_pred%20%3D%20_model.predict(_X_test)%0A%0A%20%20%20%20%20%20%20%20_submission%20%3D%20pl.DataFrame(%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22SMILES%22%3A%20%20%20%20%20%20%20%20_test_df%5B%22SMILES%22%5D.to_list()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22Molecule%20Name%22%3A%20_test_df%5B%22Molecule%20Name%22%5D.to_list()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22pEC50%22%3A%20%20%20%20%20%20%20%20%20_y_pred.tolist()%2C%0A%20%20%20%20%20%20%20%20%7D)%0A%0A%20%20%20%20%20%20%20%20_PRED_OUT.parent.mkdir(parents%3DTrue%2C%20exist_ok%3DTrue)%0A%20%20%20%20%20%20%20%20_submission.write_csv(_PRED_OUT)%0A%20%20%20%20%20%20%20%20print(f%22Saved%20%7Blen(_submission)%7D%20predictions%20%E2%86%92%20%7B_PRED_OUT%7D%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(Iterable%2C%20Optional%2C%20Path%2C%20mo%2C%20np%2C%20pd)%3A%0A%20%20%20%20%22%22%22%0A%20%20%20%20Validate%20the%20submission%20file%20using%20the%20rules%20from%20the%20OpenADMET%20activity_validation.py%20file%3A%0A%20%20%20%20%20%20-%20Required%20columns%3A%20SMILES%2C%20Molecule%20Name%2C%20pEC50%0A%20%20%20%20%20%20-%20No%20missing%20identifiers%20or%20duplicate%20Molecule%20Names%0A%20%20%20%20%20%20-%20pEC50%20must%20be%20numeric%20and%20finite%0A%20%20%20%20%20%20-%20Exactly%20513%20rows%0A%20%20%20%20%22%22%22%0A%0A%20%20%20%20_PRED_OUT%20%3D%20Path(%22..%2Fpredictions%2F2_ml_baseline_chemeleon_test_submission.csv%22)%0A%0A%20%20%20%20ACTIVITY_DATASET_SIZE%20%3D%20513%0A%0A%0A%20%20%20%20def%20_as_set(values%3A%20Iterable%5Bstr%5D)%20-%3E%20set%5Bstr%5D%3A%0A%20%20%20%20%20%20%20%20return%20%7Bstr(v)%20for%20v%20in%20values%7D%0A%0A%0A%20%20%20%20def%20validate_activity_submission(%0A%20%20%20%20%20%20%20%20activity_predictions_file%3A%20Path%2C%0A%20%20%20%20%20%20%20%20expected_ids%3A%20Optional%5Bset%5Bstr%5D%5D%20%3D%20None%2C%0A%20%20%20%20%20%20%20%20required_id_columns%3A%20tuple%5Bstr%2C%20...%5D%20%3D%20(%22SMILES%22%2C%20%22Molecule%20Name%22)%2C%0A%20%20%20%20%20%20%20%20required_value_columns%3A%20tuple%5Bstr%2C%20...%5D%20%3D%20(%22pEC50%22%2C)%2C%0A%20%20%20%20)%20-%3E%20tuple%5Bbool%2C%20list%5Bstr%5D%5D%3A%0A%20%20%20%20%20%20%20%20errors%3A%20list%5Bstr%5D%20%3D%20%5B%5D%0A%0A%20%20%20%20%20%20%20%20path%20%3D%20Path(activity_predictions_file)%0A%20%20%20%20%20%20%20%20if%20not%20path.exists()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20False%2C%20%5Bf%22File%20does%20not%20exist%3A%20%7Bpath%7D%22%5D%0A%0A%20%20%20%20%20%20%20%20try%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20activity_predictions%20%3D%20pd.read_csv(path)%0A%20%20%20%20%20%20%20%20except%20Exception%20as%20exc%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20False%2C%20%5Bf%22Error%20reading%20CSV%20file%3A%20%7Bexc%7D%22%5D%0A%0A%20%20%20%20%20%20%20%20required_columns%20%3D%20(*required_id_columns%2C%20*required_value_columns)%0A%20%20%20%20%20%20%20%20missing_columns%20%3D%20%5Bcol%20for%20col%20in%20required_columns%20if%20col%20not%20in%20activity_predictions.columns%5D%0A%20%20%20%20%20%20%20%20if%20missing_columns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Missing%20required%20column(s)%3A%20%7Bmissing_columns%7D%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20False%2C%20errors%0A%0A%20%20%20%20%20%20%20%20if%20activity_predictions.empty%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20errors.append(%22Submission%20is%20empty.%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20return%20False%2C%20errors%0A%0A%20%20%20%20%20%20%20%20null_id_rows%20%3D%20activity_predictions%5Blist(required_id_columns)%5D.isna().any(axis%3D1).sum()%0A%20%20%20%20%20%20%20%20if%20null_id_rows%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Found%20%7Bnull_id_rows%7D%20row(s)%20with%20missing%20identifier%20values.%22)%0A%0A%20%20%20%20%20%20%20%20if%20%22Molecule%20Name%22%20in%20activity_predictions.columns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20duplicate_ids%20%3D%20activity_predictions%5B%22Molecule%20Name%22%5D.duplicated().sum()%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20duplicate_ids%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Found%20%7Bduplicate_ids%7D%20duplicated%20'Molecule%20Name'%20value(s).%22)%0A%0A%20%20%20%20%20%20%20%20for%20col%20in%20required_value_columns%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20numeric_col%20%3D%20pd.to_numeric(activity_predictions%5Bcol%5D%2C%20errors%3D%22coerce%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20invalid_numeric%20%3D%20numeric_col.isna().sum()%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20invalid_numeric%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Column%20'%7Bcol%7D'%20contains%20%7Binvalid_numeric%7D%20non-numeric%20or%20missing%20value(s).%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20continue%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20non_finite%20%3D%20(~np.isfinite(numeric_col.to_numpy())).sum()%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20non_finite%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Column%20'%7Bcol%7D'%20contains%20%7Bnon_finite%7D%20non-finite%20value(s)%20(inf%20or%20-inf).%22)%0A%0A%20%20%20%20%20%20%20%20submitted_ids%20%3D%20_as_set(activity_predictions%5B%22Molecule%20Name%22%5D)%0A%20%20%20%20%20%20%20%20if%20expected_ids%20is%20not%20None%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20expected_ids%20%3D%20_as_set(expected_ids)%0A%20%20%20%20%20%20%20%20%20%20%20%20missing%20%3D%20sorted(expected_ids%20-%20submitted_ids)%0A%20%20%20%20%20%20%20%20%20%20%20%20extra%20%3D%20sorted(submitted_ids%20-%20expected_ids)%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20missing%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Missing%20%7Blen(missing)%7D%20expected%20molecule(s)%3A%20%7Bmissing%5B%3A20%5D%7D%22)%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20extra%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20errors.append(f%22Found%20%7Blen(extra)%7D%20unexpected%20molecule(s)%3A%20%7Bextra%5B%3A20%5D%7D%22)%0A%20%20%20%20%20%20%20%20elif%20len(activity_predictions)%20!%3D%20ACTIVITY_DATASET_SIZE%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20errors.append(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20f%22Submission%20contains%20%7Blen(activity_predictions)%7D%20rows%2C%20expected%20%7BACTIVITY_DATASET_SIZE%7D.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20return%20len(errors)%20%3D%3D%200%2C%20errors%0A%0A%20%20%20%20_ok%2C%20_errs%20%3D%20validate_activity_submission(_PRED_OUT)%0A%20%20%20%20if%20_ok%3A%0A%20%20%20%20%20%20%20%20_out%20%3D%20mo.md(f%22**Validation%20passed.**%20%60%7B_PRED_OUT.name%7D%60%20is%20ready%20for%20submission.%22)%0A%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20_out%20%3D%20mo.md(%22**Validation%20failed%3A**%5Cn%22%20%2B%20%22%5Cn%22.join(f%22-%20%7Be%7D%22%20for%20e%20in%20_errs))%0A%20%20%20%20_out%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20I%20submitted%20the%20file%20using%20the%20Web%20GUI.%20At%20the%20time%20of%20submission%2C%20I%20was%20rank%2057%20of%2084%2C%20just%20above%20the%20baseline%20provided%20by%20OpenADMET%20using%20LGBM.%0A%20%20%20%20The%20provided%20results%20are%3A%0A%0A%20%20%20%20%7C%20Data%20%7C%20MAE%20%7C%20R2%20%7C%20rho%20%7C%0A%20%20%20%20%7C%20---%20%20%7C%20---%20%7C%20---%20%7C%20---%20%7C%0A%20%20%20%20%7C%20Test%20%7C%200.5738%20%7C%200.3355%20%7C%200.7084%20%7C%0A%20%20%20%20%7C%20CV%20%20%7C%200.50%20%7C%20%200.62%20%7C%200.75%20%7C%0A%0A%0A%20%20%20%20Comparing%20these%20results%20to%20the%20CV%20estimates%2C%20we%20see%20a%20large%20drop%20in%20R%C2%B2%2C%20while%20rho%20and%20MAE%20degrade%20more%20moderately.%0A%20%20%20%20This%20suggests%20meaningful%20differences%20in%20the%20distribution%20of%20activity%20values%20between%20the%20CV%20and%20test%20sets%2C%20either%20in%20the%20predicted%20or%20observed%20distributions.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0Aif%20__name__%20%3D%3D%20%22__main__%22%3A%0A%20%20%20%20app.run()%0A