template<typename Index_, typename Data_, typename Distance_, class Matrix_ = knncolle::Matrix<Index_, Data_>, class DistanceMetricData_ = knncolle::DistanceMetric<Data_, Distance_>, typename KmeansIndex_ = Index_, typename KmeansData_ = Data_, typename KmeansCluster_ = Index_, typename KmeansFloat_ = Distance_, class KmeansMatrix_ = kmeans::SimpleMatrix<KmeansIndex_, KmeansData_>, class DistanceMetricCenter_ = knncolle::DistanceMetric<KmeansFloat_, Distance_>>
class knncolle_kmknn::KmknnBuilder< Index_, Data_, Distance_, Matrix_, DistanceMetricData_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_, DistanceMetricCenter_ >
Perform a nearest neighbor search based on k-means clustering.
In the k-means with k-nearest neighbors (KMKNN) algorithm (Wang, 2012), k-means clustering is first applied to the data points, with the number of cluster centers defined as the square root of the number of points. The cluster assignment and distance to the assigned cluster center for each point represent the KMKNN indexing information, allowing us to speed up the nearest neighbor search by exploiting the triangle inequality between cluster centers, the query point and each point in the cluster to narrow the search space. The advantage of the KMKNN approach is its simplicity and minimal overhead, resulting in performance improvements over conventional tree-based methods for high-dimensional data where most points need to be searched anyway.
Note that the KMKNN search is theoretically "exact" but the behavior of the implementation will be affected by round-off error for floating-point inputs. See the related discussion in knncolle::VptreeBuilder for more details.
- Template Parameters
-
| Index_ | Integer type for the observation indices. |
| Data_ | Numeric type for the input and query data. |
| Distance_ | Floating-point type for the distances. |
| Matrix_ | Class of the input data matrix. This should satisfy the knncolle::Matrix interface. |
| DistanceMetricData_ | Class implementing the calculation of distances between observations. This should satisfy the knncolle::DistanceMetric interface. |
| KmeansIndex_ | Integer type of the observation indices for kmeans. |
| KmeansData_ | Numeric type of the input data for kmeans. |
| KmeansCluster_ | Integer type of the cluster identities for kmeans. |
| KmeansFloat_ | Floating-point type of the cluster centroids. |
| KmeansMatrix_ | Class of the input data matrix for kmeans. This should satisfy the kmeans::Matrix interface, most typically a kmeans::SimpleMatrix. (Note that this is a different class from the knncolle::Matrix interface!) |
| DistanceMetricCenter_ | Class implementing the calculation of distances between an observation and a cluster centroid. This should satisfy the knncolle::DistanceMetric interface. |
- See also
- Wang X (2012). A fast exact k-nearest neighbors algorithm for high dimensional search using k-means clustering and triangle inequality. Proc Int Jt Conf Neural Netw, 43, 6:2351-2358.
template<typename Index_ , typename Data_ , typename Distance_ , class Matrix_ = knncolle::Matrix<Index_, Data_>, class DistanceMetricData_ = knncolle::DistanceMetric<Data_, Distance_>, typename KmeansIndex_ = Index_, typename KmeansData_ = Data_, typename KmeansCluster_ = Index_, typename KmeansFloat_ = Distance_, class KmeansMatrix_ = kmeans::SimpleMatrix<KmeansIndex_, KmeansData_>, class DistanceMetricCenter_ = knncolle::DistanceMetric<KmeansFloat_, Distance_>>
| KmknnOptions<Index_, Data_, Distance_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_> knncolle_kmknn::KmknnBuilder< Index_, Data_, Distance_, Matrix_, DistanceMetricData_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_, DistanceMetricCenter_ >::Options |
Convenient name for the KmknnOptions class that ensures consistent template parametrization.
template<typename Index_ , typename Data_ , typename Distance_ , class Matrix_ = knncolle::Matrix<Index_, Data_>, class DistanceMetricData_ = knncolle::DistanceMetric<Data_, Distance_>, typename KmeansIndex_ = Index_, typename KmeansData_ = Data_, typename KmeansCluster_ = Index_, typename KmeansFloat_ = Distance_, class KmeansMatrix_ = kmeans::SimpleMatrix<KmeansIndex_, KmeansData_>, class DistanceMetricCenter_ = knncolle::DistanceMetric<KmeansFloat_, Distance_>>
| knncolle_kmknn::KmknnBuilder< Index_, Data_, Distance_, Matrix_, DistanceMetricData_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_, DistanceMetricCenter_ >::KmknnBuilder |
( |
std::shared_ptr< const DistanceMetricData_ > | metric_data, |
|
|
std::shared_ptr< const DistanceMetricCenter_ > | metric_center, |
|
|
Options | options ) |
|
inline |
- Parameters
-
| metric_data | Pointer to a distance metric (e.g., knncolle::EuclideanDistance) for computing distances between observations. |
| metric_center | Pointer to a distance metric (e.g., knncolle::EuclideanDistance) for computing distances between observations and cluster centroids. |
| options | Further options for the KMKNN algorithm. |
metric_center and metric_data should compute the same distance metric. Specifically, the distance computed by metric_data between two Data_ arrays should be the same as the distance computed by metric_center between two KmeansFloat_ arrays with the same contents, assuming that the contents can be losslessly converted from Data_ to KmeansFloat_. This might seem redundant but we accept two separate arguments here to support use cases where Data_ and KmeansFloat_ are different types (e.g., integer Data_ and double-precision KmeansFloat_), where a more efficient calculation of the same distance may be possible with one of the types.
template<typename Index_ , typename Data_ , typename Distance_ , class Matrix_ = knncolle::Matrix<Index_, Data_>, class DistanceMetricData_ = knncolle::DistanceMetric<Data_, Distance_>, typename KmeansIndex_ = Index_, typename KmeansData_ = Data_, typename KmeansCluster_ = Index_, typename KmeansFloat_ = Distance_, class KmeansMatrix_ = kmeans::SimpleMatrix<KmeansIndex_, KmeansData_>, class DistanceMetricCenter_ = knncolle::DistanceMetric<KmeansFloat_, Distance_>>
| auto knncolle_kmknn::KmknnBuilder< Index_, Data_, Distance_, Matrix_, DistanceMetricData_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_, DistanceMetricCenter_ >::build_known_raw |
( |
const Matrix_ & | data | ) |
const |
|
inline |
Override to assist devirtualization.
template<typename Index_ , typename Data_ , typename Distance_ , class Matrix_ = knncolle::Matrix<Index_, Data_>, class DistanceMetricData_ = knncolle::DistanceMetric<Data_, Distance_>, typename KmeansIndex_ = Index_, typename KmeansData_ = Data_, typename KmeansCluster_ = Index_, typename KmeansFloat_ = Distance_, class KmeansMatrix_ = kmeans::SimpleMatrix<KmeansIndex_, KmeansData_>, class DistanceMetricCenter_ = knncolle::DistanceMetric<KmeansFloat_, Distance_>>
| auto knncolle_kmknn::KmknnBuilder< Index_, Data_, Distance_, Matrix_, DistanceMetricData_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_, DistanceMetricCenter_ >::build_known_shared |
( |
const Matrix_ & | data | ) |
const |
|
inline |
Override to assist devirtualization.
template<typename Index_ , typename Data_ , typename Distance_ , class Matrix_ = knncolle::Matrix<Index_, Data_>, class DistanceMetricData_ = knncolle::DistanceMetric<Data_, Distance_>, typename KmeansIndex_ = Index_, typename KmeansData_ = Data_, typename KmeansCluster_ = Index_, typename KmeansFloat_ = Distance_, class KmeansMatrix_ = kmeans::SimpleMatrix<KmeansIndex_, KmeansData_>, class DistanceMetricCenter_ = knncolle::DistanceMetric<KmeansFloat_, Distance_>>
| auto knncolle_kmknn::KmknnBuilder< Index_, Data_, Distance_, Matrix_, DistanceMetricData_, KmeansIndex_, KmeansData_, KmeansCluster_, KmeansFloat_, KmeansMatrix_, DistanceMetricCenter_ >::build_known_unique |
( |
const Matrix_ & | data | ) |
const |
|
inline |
Override to assist devirtualization.