Spaces:
Sleeping
Sleeping
File size: 1,775 Bytes
35189e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
"""
This file contains the data preprocessing for Tox21.
As an input it takes a list of SMILES and it outputs a nested dictionary with
SMILES and target names as keys.
"""
import os
import json
import argparse
import numpy as np
from src.preprocess import create_descriptors, get_tox21_split
from src.utils import TASKS, HF_TOKEN, create_dir, normalize_config
# Command-line interface of the preprocessing script.
parser = argparse.ArgumentParser(
    description="Data preprocessing script for the Tox21 dataset",
)
# Path of the JSON configuration file that is loaded when the script is run.
parser.add_argument("--config", type=str, default="config/config.json")
def main(config):
    """Preprocess the Tox21 train and validation splits into ``.npz`` files.

    For each split, descriptors are computed from the split's SMILES column
    with ``create_descriptors`` and the label arrays of all ``TASKS`` are
    stacked along axis 1. The descriptors, the second value returned by
    ``create_descriptors`` (stored as ``clean_mol_mask``) and the labels are
    written to ``<data_folder>/tox21_<split>_cv4.npz``.

    Args:
        config: Mapping that provides the keys ``"cvfold"``,
            ``"descriptors"``, ``"ecfp"`` (keyword arguments forwarded to
            ``create_descriptors``) and ``"data_folder"`` (output directory).
    """
    dataset = get_tox21_split(HF_TOKEN, cvfold=config["cvfold"])

    for split_name in ("train", "validation"):
        print(f"Preprocess {split_name} molecules")
        split_data = dataset[split_name]

        descriptors, mol_mask = create_descriptors(
            list(split_data["smiles"]), config["descriptors"], **config["ecfp"]
        )

        # Stack the per-task label arrays along axis 1, in TASKS order.
        task_labels = np.stack(
            [split_data[task_name].to_numpy() for task_name in TASKS],
            axis=1,
        )

        # NOTE(review): the "cv4" suffix is fixed even though the fold is
        # taken from config["cvfold"] above — confirm this is intended.
        out_path = os.path.join(
            config["data_folder"], f"tox21_{split_name}_cv4.npz"
        )
        with open(out_path, "wb") as out_file:
            # Each entry of the descriptors mapping becomes its own array in
            # the archive, next to the mask and the labels.
            np.savez(
                out_file,
                clean_mol_mask=mol_mask,
                labels=task_labels,
                **descriptors,
            )
        print(f"Saved preprocessed {split_name} split under {out_path}")

    print("Preprocessing finished successfully")
if __name__ == "__main__":
    # Script entry point: read the JSON config named by --config, normalize
    # it, make sure the output folder exists, then run the preprocessing.
    args = parser.parse_args()

    # JSON is UTF-8 by specification; decode it explicitly instead of relying
    # on the platform's locale-dependent default encoding.
    with open(args.config, "r", encoding="utf-8") as f:
        config = json.load(f)
    config = normalize_config(config)

    # The output folder is prepared before main() writes the .npz files.
    create_dir(config["data_folder"])
    main(config)
|