Dear DeepChem community,
I am trying to tune hyperparameters for my multiclass classification models.
There are four classes in the y label.
The data is like:
CAS Number Chemical Name Level SMILES
0 51285 2,4-Dinitrophenol 2 Oc1ccc(cc1[N+]([O-])=O)[N+]([O-])=O
1 52517 2-Bromo-2-nitro-1,3-propanediol 0 OCC(Br)(CO)[N+]([O-])=O
My code for hyperparameter tuning of the multiclass classification model looks like this:
from deepchem.utils.data_utils import load_csv_files
# Single prediction task: the "Level" column of the CSV (the class label).
tasks=['Level']
# Featurize each molecule as a circular fingerprint of length 1024.
featurizer = dc.feat.CircularFingerprint(size=1024)
dataset_file= "CASNameLevelSmilesLast.csv"
# Load the CSV, reading molecules from the "SMILES" column.
loader = dc.data.CSVLoader(tasks=tasks, feature_field="SMILES", featurizer=featurizer)
dataset = loader.create_dataset(dataset_file)
# NOTE(review): the balancing transformer is built from, and applied to, the
# full dataset before it is split below -- confirm this ordering is intended.
transformer=dc.trans.BalancingTransformer(dataset=dataset)
dataset = transformer.transform(dataset)
# NOTE(review): `splitter` is not defined in this snippet; presumably it was
# created in an earlier notebook cell -- confirm.
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)
# ROC-AUC metric with np.mean passed as the second argument (presumably the
# cross-task averager -- confirm against the DeepChem Metric docs).
# NOTE(review): no number of classes is specified here, while the labels are
# said to have four classes; the traceback further down fails in
# to_one_hot() on the check `len(np.unique(y)) > n_classes`.
metric = dc.metrics.Metric(sklearn.metrics.roc_auc_score, np.mean)
def rf_model_builder(n_estimators, model_dir):
    """Build a DeepChem-wrapped random forest classifier.

    This is the model-builder callable handed to ``dc.hyper.GridHyperparamOpt``.

    Parameters
    ----------
    n_estimators
        Number of trees, forwarded to ``RandomForestClassifier``.
    model_dir
        Forwarded as the second argument of ``dc.models.SklearnModel``.

    Returns
    -------
    The ``dc.models.SklearnModel`` wrapping the scikit-learn estimator.
    """
    # class_weight="balanced" is set on the scikit-learn estimator itself.
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=n_estimators)
    return dc.models.SklearnModel(sklearn_model, model_dir)
# Hyperparameter grid: only the number of trees is varied (10 and 100).
params_dict = {
"n_estimators": [10, 100],
}
# Grid search over params_dict, building each model with rf_model_builder.
optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)
# This is the call at which the traceback below reports the ValueError: per
# that traceback, hyperparam_search evaluates the model on valid_dataset via
# model.evaluate(valid_dataset, [metric], output_transformers), without
# passing an explicit n_classes.
best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
params_dict, train_dataset, valid_dataset, metric=metric)
But I got a ValueError: y has more than n_class unique elements.
The whole log is:
ValueError Traceback (most recent call last)
Input In [137], in <cell line: 9>()
4 params_dict = {
5 "n_estimators": [10, 100],
6 }
8 optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)
----> 9 best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
10 params_dict, train_dataset, valid_dataset, metric=metric)
File D:\Software\installed\anaconda\envs\deepchem\lib\site-packages\deepchem\hyper\grid_search.py:202, in GridHyperparamOpt.hyperparam_search(self, params_dict, train_dataset, valid_dataset, metric, output_transformers, nb_epoch, use_max, logdir, logfile, **kwargs)
199 except NotImplementedError:
200 pass
--> 202 multitask_scores = model.evaluate(valid_dataset, [metric],
203 output_transformers)
204 valid_score = multitask_scores[metric.name]
205 all_scores[hp_str] = valid_score
File D:\Software\installed\anaconda\envs\deepchem\lib\site-packages\deepchem\models\models.py:215, in Model.evaluate(self, dataset, metrics, transformers, per_task_metrics, use_sample_weights, n_classes)
165 """
166 Evaluates the performance of this model on specified dataset.
167
(...)
212 separately.
213 """
214 evaluator = Evaluator(self, dataset, transformers)
--> 215 return evaluator.compute_model_performance(
216 metrics,
217 per_task_metrics=per_task_metrics,
218 use_sample_weights=use_sample_weights,
219 n_classes=n_classes)
File D:\Software\installed\anaconda\envs\deepchem\lib\site-packages\deepchem\utils\evaluate.py:309, in Evaluator.compute_model_performance(self, metrics, csv_out, stats_out, per_task_metrics, use_sample_weights, n_classes)
307 # Compute multitask metrics
308 for metric in metrics:
--> 309 results = metric.compute_metric(
310 y,
311 y_pred,
312 w,
313 per_task_metrics=per_task_metrics,
314 n_tasks=n_tasks,
315 n_classes=n_classes,
316 use_sample_weights=use_sample_weights)
317 if per_task_metrics:
318 multitask_scores[metric.name], computed_metrics = results
File D:\Software\installed\anaconda\envs\deepchem\lib\site-packages\deepchem\metrics\metric.py:621, in Metric.compute_metric(self, y_true, y_pred, w, n_tasks, n_classes, per_task_metrics, use_sample_weights, **kwargs)
617 # check whether n_tasks is int or not
618 # This is because `normalize_weight_shape` require int value.
619 assert isinstance(n_tasks, int)
--> 621 y_true_arr = normalize_labels_shape(
622 y_true_arr, mode=self.mode, n_tasks=n_tasks, n_classes=n_classes)
623 y_pred_arr = normalize_prediction_shape(
624 y_pred_arr, mode=self.mode, n_tasks=n_tasks, n_classes=n_classes)
625 if self.mode == "classification":
File D:\Software\installed\anaconda\envs\deepchem\lib\site-packages\deepchem\metrics\metric.py:174, in normalize_labels_shape(y, mode, n_tasks, n_classes)
172 # check whether n_classes is int or not
173 assert isinstance(n_classes, int)
--> 174 y_hot = to_one_hot(y_task, n_classes=n_classes)
175 y_hot = np.expand_dims(y_hot, 1)
176 all_y_task.append(y_hot)
File D:\Software\installed\anaconda\envs\deepchem\lib\site-packages\deepchem\metrics\metric.py:395, in to_one_hot(y, n_classes)
393 raise ValueError("y must be a vector of shape (N,) or (N, 1)")
394 if len(np.unique(y)) > n_classes:
--> 395 raise ValueError("y has more than n_class unique elements.")
396 N = np.shape(y)[0]
397 y_hot = np.zeros((N, n_classes))
ValueError: y has more than n_class unique elements.
Any hint would be helpful!