"""
Score functions and loss functions for multiclass quantification problems
"""
# Authors: Alberto Castaño <bertocast@gmail.com>
# Pablo González <gonzalezgpablo@uniovi.es>
# Jaime Alonso <jalonso@uniovi.es>
# Pablo Pérez <pabloperez@uniovi.es>
# Juan José del Coz <juanjo@uniovi.es>
# License: GPLv3, University of Oviedo

import numpy as np
import scipy.stats
from sklearn.utils.validation import check_array, check_consistent_length
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import unique_labels

def check_prevalences(p_true, p_pred):
    """Check that p_true and p_pred are valid and consistent.

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences

    Returns
    -------
    p_true : array-like of shape = (n_classes, 1)
        The converted and validated p_true array
    p_pred : array-like of shape = (n_classes, 1)
        The converted and validated p_pred array
    """
    check_consistent_length(p_true, p_pred)
    p_true = check_array(p_true, ensure_2d=False)
    p_pred = check_array(p_pred, ensure_2d=False)

    if p_true.ndim == 1:
        p_true = p_true.reshape((-1, 1))
    if p_pred.ndim == 1:
        p_pred = p_pred.reshape((-1, 1))

    if p_true.shape[1] != p_pred.shape[1]:
        raise ValueError("p_true and p_pred have different lengths")
    return p_true, p_pred
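
# Usage sketch (illustrative values, not from the original source): 1-d
# prevalence vectors come back as column vectors of shape (n_classes, 1),
# ready for the elementwise math in the metrics below.
#     >>> p_t, p_p = check_prevalences([0.2, 0.8], [0.3, 0.7])
#     >>> p_t.shape
#     (2, 1)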

def kld(p_true, p_pred, eps=1e-12):
    r"""Kullback-Leibler divergence (KLD)

    :math:`kld = \sum_{j=1}^{j=l} p_j \cdot \log_2{(p_j/\hat{p}_j)}`

    being `l` the number of classes.

    Also known as discrimination information, relative entropy or normalized cross-entropy
    (see [Esuli and Sebastiani 2010; Forman 2008]). KLD is a special case of the family of f-divergences.

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences
    eps : float
        To prevent division by 0 and Inf/NaN results

    Returns
    -------
    KLD : float
        The Kullback-Leibler divergence
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    # eps in both terms guards against division by 0 and against log2(0)
    # when a prevalence is exactly zero
    return np.sum(p_true * np.log2((p_true + eps) / (p_pred + eps)))
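
# Usage sketch (illustrative values): the divergence is ~0 when the estimate
# matches the true prevalences and grows (in bits, since the implementation
# uses log2) as they diverge.
#     >>> kld([0.3, 0.3, 0.4], [0.3, 0.3, 0.4])   # ~0.0
#     >>> kld([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])   # ~0.051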

def mean_absolute_error(p_true, p_pred):
    r"""Mean absolute error

    :math:`mae = 1/l \sum_{j=1}^{j=l} | p_j - \hat{p}_j |`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences

    Returns
    -------
    MAE : float
        The mean absolute error
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    return np.mean(np.abs(p_pred - p_true))

def l1(p_true, p_pred):
    r"""L1 loss function

    :math:`l1 = \sum_{j=1}^{j=l} | p_j - \hat{p}_j |`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences

    Returns
    -------
    l1 : float
        The L1 loss
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    return np.sum(np.abs(p_true - p_pred))
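
# Note the relation between the two absolute-error measures: over `l`
# classes, l1 = l * mae. Sketch with illustrative values:
#     >>> mean_absolute_error([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])   # ~0.0667
#     >>> l1([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])                    # 0.2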

def mean_squared_error(p_true, p_pred):
    r"""Mean squared error

    :math:`mse = 1/l \sum_{j=1}^{j=l} (p_j - \hat{p}_j)^2`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences

    Returns
    -------
    MSE : float
        The mean squared error
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    return np.mean((p_pred - p_true)**2)

def l2(p_true, p_pred):
    r"""L2 loss function

    :math:`l2 = \sqrt{\sum_{j=1}^{j=l} (p_j - \hat{p}_j)^2}`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences

    Returns
    -------
    l2 : float
        The L2 loss
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    return np.sqrt(np.sum((p_true - p_pred) ** 2))
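
# Analogously to l1/mae, l2 relates to mse as l2 = sqrt(l * mse) for `l`
# classes. Sketch with illustrative values:
#     >>> mean_squared_error([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])   # ~0.00667
#     >>> l2([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])                   # ~0.1414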

def hd(p_true, p_pred):
    r"""Hellinger distance (HD)

    :math:`hd = \sqrt{\sum_{j=1}^{j=l} (\sqrt{p_j} - \sqrt{\hat{p}_j})^2}`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape = (n_classes,)
        True prevalences
    p_pred : array_like, shape = (n_classes,)
        Predicted prevalences

    Returns
    -------
    HD : float
        The Hellinger distance
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    dist = np.sqrt(np.sum((np.sqrt(p_pred) - np.sqrt(p_true)) ** 2))
    return dist
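
# Sketch (illustrative values): HD is 0 for identical prevalence vectors and
# reaches its maximum, sqrt(2), for distributions with disjoint support.
#     >>> hd([0.5, 0.5], [0.5, 0.5])   # 0.0
#     >>> hd([1.0, 0.0], [0.0, 1.0])   # ~1.4142 (= sqrt(2))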

def bray_curtis(p_true, p_pred):
    r"""Bray-Curtis dissimilarity

    :math:`bcd = \sum_{j=1}^{j=l} | p_j - \hat{p}_j | / \sum_{j=1}^{j=l} (p_j + \hat{p}_j)`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape=(n_classes,)
        True prevalences. In case of binary quantification, this parameter could be a single float value.
    p_pred : array_like, shape=(n_classes,)
        Predicted prevalences. In case of binary quantification, this parameter could be a single float value.

    Returns
    -------
    BCD : float
        The Bray-Curtis dissimilarity
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    return np.sum(np.abs(p_true - p_pred)) / np.sum(p_true + p_pred)
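
# Sketch (illustrative): when both arguments are proper prevalence vectors
# (each summing to 1), the denominator is 2, so bcd reduces to l1 / 2.
#     >>> bray_curtis([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])   # 0.1 (= 0.2 / 2)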

def topsoe(p_true, p_pred, epsilon=1e-20):
    r"""Topsoe distance

    .. math::
        topsoe = \sum_{j=1}^{j=l} (p_j \cdot \log{((2 \cdot p_j + \epsilon)/(p_j + \hat{p}_j + \epsilon))}) +
                 (\hat{p}_j \cdot \log{((2 \cdot \hat{p}_j + \epsilon)/(p_j + \hat{p}_j + \epsilon))})

    being `l` the number of classes.

    Parameters
    ----------
    p_true : array_like, shape=(n_classes,)
        True prevalences. In case of binary quantification, this parameter could be a single float value.
    p_pred : array_like, shape=(n_classes,)
        Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
    epsilon : float, optional
        To avoid division by 0

    Returns
    -------
    TOPSOE : float
        The Topsoe distance
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    dist = np.sum(p_true * np.log((2 * p_true + epsilon) / (p_true + p_pred + epsilon)) +
                  p_pred * np.log((2 * p_pred + epsilon) / (p_true + p_pred + epsilon)))
    return dist

def jensenshannon(p_true, p_pred, epsilon=1e-20):
    r"""Jensen-Shannon divergence (a=1/2)

    .. math::
        jsd = 1/2 \sum_{j=1}^{j=l} p_j \cdot \log{(p_j + \epsilon)} + \hat{p}_j \cdot \log{(\hat{p}_j + \epsilon)} -
              (p_j + \hat{p}_j) \cdot \log{((p_j + \hat{p}_j + \epsilon) / 2)}

    being `l` the number of classes.

    Parameters
    ----------
    p_true : array_like, shape=(n_classes,)
        True prevalences. In case of binary quantification, this parameter could be a single float value.
    p_pred : array_like, shape=(n_classes,)
        Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
    epsilon : float, optional
        To avoid division by 0

    Returns
    -------
    JSD : float
        The Jensen-Shannon divergence
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    dist = np.sum(p_true * np.log(p_true + epsilon) + p_pred * np.log(p_pred + epsilon) -
                  (p_true + p_pred) * np.log((p_true + p_pred + epsilon) / 2)) / 2
    return dist
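
# Sketch (a relation that follows from the definitions above): the Topsoe
# distance equals KL(p||m) + KL(q||m) with m = (p + q) / 2, while the
# Jensen-Shannon divergence halves that sum. Hence, for any pair of
# prevalence vectors and up to floating-point/epsilon error,
#     >>> # topsoe(p, q) == 2 * jensenshannon(p, q)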

def probsymmetric(p_true, p_pred, epsilon=1e-20):
    r"""Probabilistic Symmetric distance

    :math:`psd = 2 \cdot \sum_{j=1}^{j=l} (p_j - \hat{p}_j)^2 / (p_j + \hat{p}_j + \epsilon)`

    being `l` the number of classes

    Parameters
    ----------
    p_true : array_like, shape=(n_classes,)
        True prevalences. In case of binary quantification, this parameter could be a single float value.
    p_pred : array_like, shape=(n_classes,)
        Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
    epsilon : float, optional
        To avoid division by 0

    Returns
    -------
    PSD : float
        The probabilistic symmetric distance
    """
    p_true, p_pred = check_prevalences(p_true, p_pred)
    dist = 2 * np.sum((p_true - p_pred) ** 2 / (p_pred + p_true + epsilon))
    return dist
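
# Sketch (illustrative values): psd is a symmetrized chi-square-style
# distance; it is 0 for identical vectors and reaches its maximum of 4 for
# proper prevalence vectors with disjoint support.
#     >>> probsymmetric([0.3, 0.3, 0.4], [0.2, 0.4, 0.4])   # ~0.0686
#     >>> probsymmetric([1.0, 0.0], [0.0, 1.0])             # 4.0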

def brier_multi(p_true, p_pred):
    r"""Brier score (classification) for multiclass problems

    .. math::
        bsm = 1/n \sum_{i=1}^{i=n} \sum_{j=1}^{j=l} (\hat{p}_{ij} - p_{ij})^2

    being `l` the number of classes and `n` the number of predictions

    Parameters
    ----------
    p_true : array_like, shape=(n_examples, n_classes)
        True class memberships, one row per example (e.g., one-hot encoded)
    p_pred : array_like, shape=(n_examples, n_classes)
        Class probabilities predicted by a classifier, one row per example

    Returns
    -------
    BSM : float
        The Brier score for multiclass problems
    """
    p_true = np.asarray(p_true)
    p_pred = np.asarray(p_pred)
    return np.mean(np.sum((p_pred - p_true)**2, axis=1))
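
# Sketch (illustrative values): true labels one-hot encoded against the
# predicted class probabilities of two examples; a perfect prediction scores
# 0, the worst possible scores 2.
#     >>> y_onehot = np.array([[1, 0, 0], [0, 1, 0]])
#     >>> probs = np.array([[0.8, 0.1, 0.1], [0.2, 0.7, 0.1]])
#     >>> brier_multi(y_onehot, probs)   # 0.1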

def geometric_mean(y_true, y_pred, correction=0.0):
    """Compute the geometric mean.

    In quantification, the geometric mean is useful for training a classifier on imbalanced problems (a quite
    common issue). It tries to maximize the accuracy over all classes: the per-class accuracies must be balanced
    to obtain a good score. It is computed as the root of the product of the per-class sensitivities (recalls).
    The optimal value is 1 and the worst is 0 (which occurs when the accuracy for one class is 0). To deal with
    this worst case in highly multiclass problems, the sensitivity of unrecognized classes can be corrected to a
    given value (instead of zero); see the correction parameter.

    The implementation given here is a simplification of the one provided in the imbalanced-learn library.

    Parameters
    ----------
    y_true : ndarray, shape (n_examples,)
        True class for each example
    y_pred : ndarray, shape (n_examples,)
        Predicted class returned by a classifier
    correction : float, default=0.0
        Replaces the sensitivity of unrecognized classes (zero) with this value

    Returns
    -------
    g_mean : float
        The geometric mean
    """
    labels = unique_labels(y_true, y_pred)
    n_labels = len(labels)

    le = LabelEncoder()
    le.fit(labels)
    y_true = le.transform(y_true)
    y_pred = le.transform(y_pred)
    sorted_labels = le.classes_

    # count true positives per class: examples whose prediction matches
    tp = y_true == y_pred
    tp_bins = y_true[tp]
    if len(tp_bins):
        tp_sum = np.bincount(tp_bins, weights=None, minlength=len(labels))
    else:
        true_sum = tp_sum = np.zeros(len(labels))
    if len(y_true):
        true_sum = np.bincount(y_true, weights=None, minlength=len(labels))

    indices = np.searchsorted(sorted_labels, labels[:n_labels])
    tp_sum = tp_sum[indices]
    true_sum = true_sum[indices]

    # per-class sensitivity (recall); classes without true examples get 0
    mask = true_sum == 0.0
    true_sum[mask] = 1  # avoid infs/nans
    recall = tp_sum / true_sum
    recall[mask] = 0
    # correct the sensitivity of unrecognized classes to avoid a zero product
    recall[recall == 0] = correction

    with np.errstate(divide="ignore", invalid="ignore"):
        g_mean = scipy.stats.gmean(recall)
    return g_mean
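
# Sketch (illustrative values): a classifier that ignores the minority class
# gets a geometric mean of 0 even though its plain accuracy is high, which is
# the point of using this score on imbalanced problems.
#     >>> geometric_mean([0, 0, 0, 1], [0, 0, 0, 0])                   # 0.0
#     >>> geometric_mean([0, 0, 0, 1], [0, 0, 0, 0], correction=0.01)  # 0.1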
#
#
# def relative_absolute_error(p_true, p_pred, eps=1e-12):
# """The relation between the absolute error and the true prevalence.
#
# Parameters
# ----------
# p_true : array_like, shape=(n_classes)
# True prevalences. In case of binary quantification, this parameter could be a single float value.
#
# p_pred : array_like, shape=(n_classes)
# Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
#
# eps: to prevent division by 0
#
# """
# if np.any(p_true == 0):
# return np.mean((np.abs(p_pred - p_true) + eps) / (p_true + eps))
# return np.mean(np.abs(p_pred - p_true) / p_true)
#
#
# def symmetric_absolute_percentage_error(p_true, p_pred):
# """SAPE. A symmetric version of RAE.
#
# Parameters
# ----------
# p_true : array_like, shape=(n_classes)
# True prevalences. In case of binary quantification, this parameter could be a single float value.
#
# p_pred : array_like, shape=(n_classes)
# Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
# """
# if np.any((p_pred + p_true) == 0):
# raise NotImplementedError
# return np.abs(p_pred - p_true) / (p_pred + p_true)
#
#
#
#
# def normalized_absolute_error(p_true, p_pred):
# """A loss function, ranging from 0 (best) and 1 (worst)
#
# Parameters
# ----------
# p_true : array_like, shape=(n_classes)
# True prevalences. In case of binary quantification, this parameter could be a single float value.
#
# p_pred : array_like, shape=(n_classes)
# Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
# """
# return np.sum(np.abs(p_pred - p_true)) / (2 * (1 - np.min(p_true)))
#
#
#
#
# def normalized_relative_absolute_error(p_true, p_pred):
# """NRAE.
#
# Parameters
# ----------
# p_true : array_like, shape=(n_classes)
# True prevalences. In case of binary quantification, this parameter could be a single float value.
#
# p_pred : array_like, shape=(n_classes)
# Predicted prevalences. In case of binary quantification, this parameter could be a single float value.
# """
# l = p_true.shape[0]
# return relative_absolute_error(p_true, p_pred) / (l - 1 + (1 - np.min(p_true)) / np.min(p_true))
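

if __name__ == "__main__":
    # Smoke test with illustrative values (not part of the original module):
    # every divergence/loss above should be 0 for a perfect prevalence
    # estimate and positive for this imperfect one.
    p_true = [0.3, 0.3, 0.4]
    p_pred = [0.2, 0.4, 0.4]
    for score in (kld, mean_absolute_error, l1, mean_squared_error, l2,
                  hd, bray_curtis, topsoe, jensenshannon, probsymmetric):
        print(f"{score.__name__}: {score(p_true, p_pred):.6f}")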