File size: 1,775 Bytes
35189e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
This file contains the data preprocessing for Tox21.
As input it takes a list of SMILES and outputs a nested dictionary with
SMILES and target names as keys.
"""

import os
import json
import argparse

import numpy as np

from src.preprocess import create_descriptors, get_tox21_split
from src.utils import TASKS, HF_TOKEN, create_dir, normalize_config

# Command-line interface: one optional flag, --config, holding the path of the
# JSON configuration file (defaults to config/config.json).
_DESCRIPTION = "Data preprocessing script for the Tox21 dataset"
parser = argparse.ArgumentParser(description=_DESCRIPTION)
parser.add_argument("--config", default="config/config.json", type=str)


def main(config):
    """Compute and store molecule descriptors for the HF Tox21 dataset.

    For the "train" and "validation" splits in turn, the SMILES strings are
    passed to ``create_descriptors`` and the result is written, together with
    the clean-molecule mask and the stacked task labels, to an ``.npz``
    archive inside ``config["data_folder"]``.

    Args:
        config: Mapping with the settings used here: "cvfold", "descriptors",
            "ecfp" (expanded as keyword arguments for ``create_descriptors``)
            and "data_folder".
    """
    dataset = get_tox21_split(HF_TOKEN, cvfold=config["cvfold"])

    for split in ("train", "validation"):
        print(f"Preprocess {split} molecules")

        split_data = dataset[split]
        smiles = list(split_data["smiles"])

        features, clean_mol_mask = create_descriptors(
            smiles, config["descriptors"], **config["ecfp"]
        )

        # Per-task label arrays, stacked along axis 1 in TASKS order.
        labels = np.stack(
            [split_data[task].to_numpy() for task in TASKS], axis=1
        )

        # NOTE(review): the "cv4" suffix is fixed and does not follow
        # config["cvfold"] -- confirm this is intended.
        save_path = os.path.join(
            config["data_folder"], f"tox21_{split}_cv4.npz"
        )
        with open(save_path, "wb") as out_file:
            # Every entry of `features` becomes its own named array.
            np.savez(
                out_file,
                clean_mol_mask=clean_mol_mask,
                labels=labels,
                **features,
            )
            print(f"Saved preprocessed {split} split under {save_path}")

    print("Preprocessing finished successfully")


if __name__ == "__main__":
    # Script entry point: read the CLI arguments, load and normalize the JSON
    # config, make sure the output folder exists, then run the preprocessing.
    args = parser.parse_args()

    # Read the config as UTF-8 explicitly instead of relying on the
    # platform's locale encoding, so non-ASCII values load the same everywhere.
    with open(args.config, "r", encoding="utf-8") as f:
        config = json.load(f)
    config = normalize_config(config)

    # The data folder must exist before main() writes the .npz files into it.
    create_dir(config["data_folder"])
    main(config)