"""Proportion-weighted K-Nearest Neighbor Classifier"""
# Authors: Alberto Castaño <bertocast@gmail.com>
# Pablo González <gonzalezgpablo@uniovi.es>
# Jaime Alonso <jalonso@uniovi.es>
# Pablo Pérez <pabloperez@uniovi.es>
# Juan José del Coz <juanjo@uniovi.es>
# License: GPLv3, University of Oviedo
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.extmath import weighted_mode
class PWK(BaseEstimator, ClassifierMixin):
"""Proportion-weighted k-Nearest Neighbor Classifier
    This class is essentially a wrapper around :class:`sklearn.neighbors.KNeighborsClassifier` (version 1.0.2)
    that uses class-dependent weights to deal with imbalanced problems. The parameters are the same, except
    ``weights``, which is computed by this class.
Parameters
----------
    n_neighbors : int, default=10
Number of neighbors to use by default for :meth:`kneighbors` queries.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search
        - 'auto' will attempt to decide the most appropriate algorithm based on the values passed to the :meth:`fit` method.
Note: fitting on sparse input will override the setting of this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : str or callable, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
        It does not affect the :meth:`fit` method.
Attributes
----------
    knn_ : KNeighborsClassifier
        The underlying k-nearest neighbors classifier
    classes_ : array, shape (n_classes,)
        Class labels known to the classifier
    weights_ : array, shape (n_samples,)
        The weight of each training example; all the examples of a given
        class share the same weight
    y_ : array, shape (n_samples,)
        True labels of the training examples
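
    Examples
    --------
    A minimal usage sketch on a small imbalanced toy dataset (the data and
    parameter values are illustrative only):

    >>> import numpy as np
    >>> X = np.array([[0.0], [0.1], [0.2], [0.3], [0.9], [1.0]])
    >>> y = np.array([0, 0, 0, 0, 1, 1])
    >>> clf = PWK(n_neighbors=3).fit(X, y)
    >>> preds = clf.predict(np.array([[0.95], [0.05]]))
    >>> probs = clf.predict_proba(np.array([[0.95], [0.05]]))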
"""
def __init__(self, n_neighbors=10, algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None,
n_jobs=None):
self.knn_ = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size,
metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)
self.classes_ = None
self.weights_ = None
self.y_ = None
def fit(self, X, y):
""" Fit the k-nearest neighbors classifier and compute the weights using the training dataset
Parameters
----------
X : array-like, shape (n_examples, n_features)
Data
y : array-like, shape (n_examples, )
True classes
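
        Returns
        -------
        self : PWK
            The fitted classifier.

        Examples
        --------
        A tiny sketch of the weight computation (illustrative data): with
        three examples of class 0 and one of class 1, the weights are
        1 - 3/4 = 0.25 and 1 - 1/4 = 0.75, respectively.

        >>> import numpy as np
        >>> clf = PWK(n_neighbors=1).fit(np.array([[0.], [1.], [2.], [3.]]),
        ...                              np.array([0, 0, 0, 1]))
        >>> clf.weights_
        array([0.25, 0.25, 0.25, 0.75])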
"""
        self.y_ = np.asarray(y)
        self.classes_ = np.unique(self.y_)
        # weight each training example by the complement of its class proportion,
        # so that examples of minority classes get larger weights
        self.weights_ = np.ones(X.shape[0])
        for cls in self.classes_:
            self.weights_[self.y_ == cls] = 1 - (np.sum(self.y_ == cls) / X.shape[0])
        self.knn_.fit(X, y)
        return self
def predict(self, X):
""" Returns the crisp predictions for the provided data
Parameters
----------
X : array-like, shape (n_examples, n_features)
            Test data
Returns
-------
        preds : array, shape (n_examples,)
Crisp predictions for the examples in X
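
        Notes
        -----
        The prediction is a weighted majority vote among the k nearest
        neighbors: each neighbor contributes the weight of its class rather
        than a plain count. A small sketch of the vote itself, using
        sklearn's weighted_mode (illustrative values):

        >>> import numpy as np
        >>> from sklearn.utils.extmath import weighted_mode
        >>> weighted_mode(np.array([[0, 0, 1]]), np.array([[0.25, 0.25, 0.75]]), axis=1)
        (array([[1.]]), array([[0.75]]))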
"""
        _, neigh_ind = self.knn_.kneighbors(X)
        # weighted vote: each neighbor contributes its class weight rather than a plain count
        mode, _ = weighted_mode(self.y_[neigh_ind], self.weights_[neigh_ind], axis=1)
        # weighted_mode returns floats, so cast back to the dtype of the training labels
        y_pred = mode.ravel().astype(self.y_.dtype)
        return y_pred
def predict_proba(self, X):
""" Returns the probabilistic predictions for the provided data
Parameters
----------
X : array-like, shape (n_examples, n_features)
            Test data
Returns
-------
        preds : array, shape (n_examples, n_classes)
Probabilistic predictions for the examples in X
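
        Examples
        --------
        A small sketch (illustrative data): with training labels
        ``[0, 0, 0, 1]`` the class weights are 0.25 and 0.75, and the
        probabilities are the normalized weight mass of each class among
        the k nearest neighbors.

        >>> import numpy as np
        >>> clf = PWK(n_neighbors=3).fit(np.array([[0.], [1.], [2.], [3.]]),
        ...                              np.array([0, 0, 0, 1]))
        >>> clf.predict_proba(np.array([[2.5]]))
        array([[0.4, 0.6]])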
"""
_, neigh_ind = self.knn_.kneighbors(X)
probabilities = np.zeros((X.shape[0], len(self.classes_)))
        for i in range(X.shape[0]):
            kneigh_classes = self.y_[neigh_ind[i, :]]
            for n_cls, cls in enumerate(self.classes_):
                # accumulate the weight mass of the neighbors that belong to cls
                probabilities[i, n_cls] = np.sum(self.weights_[neigh_ind[i, kneigh_classes == cls]])
# compute actual probabilities given the weights for each class
normalizer = probabilities.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
probabilities /= normalizer
return probabilities