"""Proportion-weighted K-Nearest Neighbor Classifier"""
# Authors: Alberto Castaño <bertocast@gmail.com>
# Pablo González <gonzalezgpablo@uniovi.es>
# Jaime Alonso <jalonso@uniovi.es>
# Pablo Pérez <pabloperez@uniovi.es>
# Juan José del Coz <juanjo@uniovi.es>
# License: GPLv3, University of Oviedo
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.extmath import weighted_mode
class PWK(BaseEstimator, ClassifierMixin):
"""Proportion-weighted k-Nearest Neighbor Classifier
    This class is essentially a wrapper around :class:`sklearn.neighbors.KNeighborsClassifier` (version 1.0.2)
    that uses class-dependent weights to deal with imbalanced problems. The parameters are the same, except
    ``weights``, which is computed by this class.
Parameters
----------
    n_neighbors : int, default=10
Number of neighbors to use by default for :meth:`kneighbors` queries.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search
        - 'auto' will attempt to decide the most appropriate algorithm based on the values passed to the :meth:`fit` method.
Note: fitting on sparse input will override the setting of this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : str or callable, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
        It does not affect the :meth:`fit` method.
Attributes
----------
    knn_ : KNeighborsClassifier
        The underlying k-nearest neighbors classifier
    classes_ : array, shape (n_classes,)
        Class labels known to the classifier
    weights_ : array, shape (n_samples,)
        The weight of each training example; all the examples of a given
        class share the same weight
    y_ : array, shape (n_samples,)
        True labels of the training examples
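
    Examples
    --------
    A minimal usage sketch on a small imbalanced toy dataset (the data and
    parameter values are illustrative only):

    >>> import numpy as np
    >>> X = np.array([[0.0], [0.1], [0.2], [0.3], [0.9], [1.0]])
    >>> y = np.array([0, 0, 0, 0, 1, 1])
    >>> clf = PWK(n_neighbors=3).fit(X, y)
    >>> preds = clf.predict(np.array([[0.95], [0.05]]))
    >>> probs = clf.predict_proba(np.array([[0.95], [0.05]]))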
"""
def __init__(self, n_neighbors=10, algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None,
n_jobs=None):
self.knn_ = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size,
metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)
self.classes_ = None
self.weights_ = None
self.y_ = None
def fit(self, X, y):
""" Fit the k-nearest neighbors classifier and compute the weights using the training dataset
Parameters
----------
X : array-like, shape (n_examples, n_features)
Data
y : array-like, shape (n_examples, )
True classes
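
        Returns
        -------
        self : PWK
            The fitted classifier.

        Examples
        --------
        A tiny sketch of the weight computation (illustrative data): with
        three examples of class 0 and one of class 1, the weights are
        1 - 3/4 = 0.25 and 1 - 1/4 = 0.75, respectively.

        >>> import numpy as np
        >>> clf = PWK(n_neighbors=1).fit(np.array([[0.], [1.], [2.], [3.]]),
        ...                              np.array([0, 0, 0, 1]))
        >>> clf.weights_
        array([0.25, 0.25, 0.25, 0.75])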
"""
        self.y_ = np.asarray(y)
        self.classes_ = np.unique(self.y_)
        # weight each training example by the complement of its class proportion,
        # so that examples of minority classes get larger weights
        self.weights_ = np.ones(X.shape[0])
        for cls in self.classes_:
            self.weights_[self.y_ == cls] = 1 - (np.sum(self.y_ == cls) / X.shape[0])
        self.knn_.fit(X, y)
        return self
def predict(self, X):
""" Returns the crisp predictions for the provided data
Parameters
----------
X : array-like, shape (n_examples, n_features)
            Test data
Returns
-------
        preds : array, shape (n_examples,)
Crisp predictions for the examples in X
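
        Notes
        -----
        The prediction is a weighted majority vote among the k nearest
        neighbors: each neighbor contributes the weight of its class rather
        than a plain count. A small sketch of the vote itself, using
        sklearn's weighted_mode (illustrative values):

        >>> import numpy as np
        >>> from sklearn.utils.extmath import weighted_mode
        >>> weighted_mode(np.array([[0, 0, 1]]), np.array([[0.25, 0.25, 0.75]]), axis=1)
        (array([[1.]]), array([[0.75]]))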
"""
        _, neigh_ind = self.knn_.kneighbors(X)
        # weighted vote: each neighbor contributes its class weight rather than a plain count
        mode, _ = weighted_mode(self.y_[neigh_ind], self.weights_[neigh_ind], axis=1)
        # weighted_mode returns floats, so cast back to the dtype of the training labels
        y_pred = mode.ravel().astype(self.y_.dtype)
        return y_pred
def predict_proba(self, X):
""" Returns the probabilistic predictions for the provided data
Parameters
----------
X : array-like, shape (n_examples, n_features)
            Test data
Returns
-------
        preds : array, shape (n_examples, n_classes)
Probabilistic predictions for the examples in X
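
        Examples
        --------
        A small sketch (illustrative data): with training labels
        ``[0, 0, 0, 1]`` the class weights are 0.25 and 0.75, and the
        probabilities are the normalized weight mass of each class among
        the k nearest neighbors.

        >>> import numpy as np
        >>> clf = PWK(n_neighbors=3).fit(np.array([[0.], [1.], [2.], [3.]]),
        ...                              np.array([0, 0, 0, 1]))
        >>> clf.predict_proba(np.array([[2.5]]))
        array([[0.4, 0.6]])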
"""
_, neigh_ind = self.knn_.kneighbors(X)
probabilities = np.zeros((X.shape[0], len(self.classes_)))
        for i in range(X.shape[0]):
            kneigh_classes = self.y_[neigh_ind[i, :]]
            for n_cls, cls in enumerate(self.classes_):
                # accumulate the weight mass of the neighbors that belong to cls
                probabilities[i, n_cls] = np.sum(self.weights_[neigh_ind[i, kneigh_classes == cls]])
# compute actual probabilities given the weights for each class
normalizer = probabilities.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
probabilities /= normalizer
return probabilities