File size: 3,249 Bytes
35189e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
This file provides a predict function for the Tox21 classifier.
As an input it takes a list of SMILES and it outputs a nested dictionary with
SMILES and target names as keys.
"""

# ---------------------------------------------------------------------------------------
# Dependencies
from collections import defaultdict

import numpy as np

import json
import joblib
import torch

from src.model import Tox21SNNClassifier, SNNConfig
from src.preprocess import create_descriptors, FeaturePreprocessor
from src.utils import TASKS, normalize_config

# ---------------------------------------------------------------------------------------
# Path of the JSON config file read by predict(). The path is relative, so it is
# resolved against the current working directory of the running process.
CONFIG_FILE = "./config/config.json"


def predict(
    smiles_list: list[str], default_prediction=0.5
) -> dict[str, dict[str, float]]:
    """Apply the classifier to a list of SMILES strings.

    Molecules that could not be cleaned during descriptor creation are not passed
    to the model; they receive ``default_prediction`` for every target instead.

    Args:
        smiles_list (list[str]): list of SMILES strings
        default_prediction (float): value reported for every target of a molecule
            that was removed during cleaning. Defaults to 0.5.

    Returns:
        dict: nested prediction dictionary, following {'<smiles>': {'<target>': <pred>}}.
            It is empty when ``smiles_list`` is empty. A SMILES string that occurs
            several times in the input maps to a single entry.
    """
    print(f"Received {len(smiles_list)} SMILES strings")

    predictions = defaultdict(dict)
    if len(smiles_list) == 0:
        # Nothing to featurize or predict; skip config/checkpoint loading entirely.
        return predictions

    # preprocessing pipeline
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        config = json.load(f)
    config = normalize_config(config)

    features, is_clean = create_descriptors(
        smiles_list, config["descriptors"], **config["ecfp"]
    )
    # The mask is used for boolean indexing and negation below, so make sure it is
    # a boolean array.
    is_clean = np.asarray(is_clean, dtype=bool)
    print(f"Created descriptors for {sum(is_clean)} molecules.")
    print(f"{len(is_clean) - sum(is_clean)} molecules removed during cleaning")

    if not is_clean.any():
        # Without a single valid molecule the data loader yields no batches and
        # np.concatenate would raise on the empty list, so answer with defaults.
        # NOTE(review): assumes TASKS holds the same target names as model.tasks
        # -- confirm against src.model.
        print("No molecule passed cleaning, returning default predictions.")
        for smiles in smiles_list:
            predictions[smiles] = {
                target: float(default_prediction) for target in TASKS
            }
        return predictions

    # setup preprocessor
    preprocessor = FeaturePreprocessor(
        feature_selection_config=config["feature_selection"],
        feature_quantilization_config=config["feature_quantilization"],
        descriptors=config["descriptors"],
        max_samples=config["max_samples"],
        scaler=config["scaler"],
    )

    # NOTE: joblib.load unpickles the file; only point the config at trusted
    # checkpoints.
    preprocessor_ckpt = joblib.load(config["preprocessor_path"])
    preprocessor.set_state(preprocessor_ckpt["preprocessor"])
    print(f"Loaded preprocessor from {config['preprocessor_path']}")

    # Only the molecules that survived cleaning are transformed and scored.
    features = {descr: array[is_clean] for descr, array in features.items()}
    features = preprocessor.transform(features)

    dataset = torch.utils.data.TensorDataset(torch.FloatTensor(features))
    # shuffle=False keeps the prediction rows aligned with the clean molecules.
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=256, shuffle=False, num_workers=0
    )

    # setup model
    cfg = SNNConfig(
        hidden_dim=512,
        n_layers=8,
        dropout=0.05,
        layer_form="rect",
        in_features=features.shape[1],
        out_features=12,
    )

    model = Tox21SNNClassifier(cfg)
    model.load_model(config["ckpt_path"])
    model.eval()
    print(f"Loaded model from {config['ckpt_path']}")

    print("Create predictions:")
    with torch.no_grad():
        preds = np.concatenate([model.predict(batch[0]) for batch in loader], axis=0)

    # Start from the default everywhere, then overwrite the rows of the clean
    # molecules with the model output (column i belongs to model.tasks[i]).
    tasks = list(model.tasks)
    all_preds = np.full((len(is_clean), len(tasks)), default_prediction, dtype=float)
    all_preds[is_clean] = preds[:, : len(tasks)]

    for smiles, row in zip(smiles_list, all_preds):
        predictions[smiles] = {
            target: float(pred) for target, pred in zip(tasks, row)
        }

    return predictions