knncolle_annoy
Annoy nearest neighbors in knncolle
Loading...
Searching...
No Matches
Annoy.hpp
1#ifndef KNNCOLLE_ANNOY_ANNOY_HPP
2#define KNNCOLLE_ANNOY_ANNOY_HPP
3
4#include <vector>
5#include <algorithm>
6#include <memory>
7#include <cstddef>
8#include <cstring>
9#include <stdexcept>
10#include <limits>
11#include <filesystem>
12
13#include "knncolle/knncolle.hpp"
14#include "annoy/annoylib.h"
15#include "annoy/kissrandom.h"
16
17#include "utils.hpp"
18
24namespace knncolle_annoy {
25
29inline static constexpr const char* annoy_prebuilt_save_name = "knncolle_annoy::Annoy";
30
39 int num_trees = 50;
40
46 double search_mult = -1;
47};
48
52template<typename Index_, typename Data_, typename Distance_, typename AnnoyDistance_, typename AnnoyIndex_, typename AnnoyData_, class AnnoyRng_, class AnnoyThreadPolicy_>
53class AnnoyPrebuilt;
54
55template<
56 typename Index_,
57 typename Data_,
58 typename Distance_,
59 class AnnoyDistance_,
60 typename AnnoyIndex_ = Index_,
61 typename AnnoyData_ = float,
62 class AnnoyRng_ = Annoy::Kiss64Random,
63 class AnnoyThreadPolicy_ = Annoy::AnnoyIndexSingleThreadedBuildPolicy
64>
65class AnnoySearcher final : public knncolle::Searcher<Index_, Data_, Distance_> {
66private:
67 const AnnoyPrebuilt<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>& my_parent;
68
69 static constexpr bool same_internal_data = std::is_same<Data_, AnnoyData_>::value;
70 typename std::conditional<!same_internal_data, std::vector<AnnoyData_>, bool>::type my_buffer;
71
72 static constexpr bool same_internal_index = std::is_same<Index_, AnnoyIndex_>::value;
73 std::vector<AnnoyIndex_> my_indices;
74
75 static constexpr bool same_internal_distance = std::is_same<Distance_, AnnoyData_>::value;
76 typename std::conditional<!same_internal_distance, std::vector<AnnoyData_>, bool>::type my_distances;
77
78 typedef int SearchKType;
79
80 Index_ my_capped_k = 0;
81
82 SearchKType get_search_k(Index_ k) const {
83 if (my_parent.my_search_mult < 1) {
84 return -1; // instructs Annoy to use k * num_trees.
85 } else if (k <= my_capped_k) {
86 return my_parent.my_search_mult * static_cast<double>(k) + 0.5; // rounded.
87 } else {
88 return std::numeric_limits<SearchKType>::max(); // cap to avoid overflow.
89 }
90 }
91
92public:
93 AnnoySearcher(const AnnoyPrebuilt<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>& parent) : my_parent(parent) {
94 if constexpr(!same_internal_data) {
95 sanisizer::resize(my_buffer, my_parent.my_dim);
96 }
97
98 if (my_parent.my_search_mult >= 1) {
99 my_capped_k = static_cast<double>(std::numeric_limits<SearchKType>::max()) / my_parent.my_search_mult;
100 }
101 }
102
103private:
104 std::pair<std::vector<AnnoyIndex_>*, std::vector<AnnoyData_>*> obtain_pointers(std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances, Index_ k) {
105 std::vector<AnnoyIndex_>* icopy_ptr = &my_indices;
106 if (output_indices) {
107 if constexpr(same_internal_index) {
108 icopy_ptr = output_indices;
109 }
110 }
111 icopy_ptr->clear();
112 icopy_ptr->reserve(k);
113
114 std::vector<AnnoyData_>* dcopy_ptr = NULL;
115 if (output_distances) {
116 if constexpr(same_internal_distance) {
117 dcopy_ptr = output_distances;
118 } else {
119 dcopy_ptr = &my_distances;
120 }
121 dcopy_ptr->clear();
122 dcopy_ptr->reserve(k);
123 }
124
125 return std::make_pair(icopy_ptr, dcopy_ptr);
126 }
127
128 template<typename Type_>
129 static void remove_self(std::vector<Type_>& vec, std::size_t at) {
130 if (at < vec.size()) {
131 vec.erase(vec.begin() + at);
132 } else {
133 vec.pop_back();
134 }
135 }
136
137 template<typename Source_, typename Dest_>
138 static void copy_skip_self(const std::vector<Source_>& source, std::vector<Dest_>& dest, std::size_t at) {
139 auto sIt = source.begin();
140 auto end = source.size();
141 dest.clear();
142 dest.reserve(end - 1);
143
144 if (at < end) {
145 dest.insert(dest.end(), sIt, sIt + at);
146 dest.insert(dest.end(), sIt + at + 1, source.end());
147 } else {
148 // Just in case we're full of ties at duplicate points, such that 'c'
149 // is not in the set. Note that, if self_found=false, we must have at
150 // least 'k+2' points for 'c' to not be detected as its own neighbor.
151 // Thus there is no need to worry whether 'end - 1 != k'; we
152 // are guaranteed to return 'k' elements in this case.
153 dest.insert(dest.end(), sIt, sIt + end - 1);
154 }
155 }
156
157public:
158 void search(Index_ i, Index_ k, std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances) {
159 // +1, as it forgets to discard 'self'. This should not overflow as k < num_obs.
160 const auto kp1 = k + 1;
161
162 auto ptrs = obtain_pointers(output_indices, output_distances, kp1);
163 auto icopy_ptr = ptrs.first;
164 auto dcopy_ptr = ptrs.second;
165
166 my_parent.my_index.get_nns_by_item(
167 i,
168 sanisizer::cast<std::size_t>(sanisizer::attest_gez(kp1)),
169 get_search_k(kp1),
170 icopy_ptr,
171 dcopy_ptr
172 );
173
174 std::size_t at;
175 {
176 const auto& cur_i = *icopy_ptr;
177 at = cur_i.size();
178 const AnnoyIndex_ icopy = i; // cast to AnnoyIndex_ to avoid signed/unsigned comparisons.
179 for (std::size_t x = 0, end = cur_i.size(); x < end; ++x) {
180 if (cur_i[x] == icopy) {
181 at = x;
182 break;
183 }
184 }
185 }
186
187 if (output_indices) {
188 if constexpr(same_internal_index) {
189 remove_self(*output_indices, at);
190 } else {
191 copy_skip_self(my_indices, *output_indices, at);
192 }
193 }
194
195 if (output_distances) {
196 if constexpr(same_internal_distance) {
197 remove_self(*output_distances, at);
198 } else {
199 copy_skip_self(my_distances, *output_distances, at);
200 }
201 }
202 }
203
204private:
205 void search_raw(const AnnoyData_* query, Index_ k, std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances) {
206 auto ptrs = obtain_pointers(output_indices, output_distances, k);
207 auto icopy_ptr = ptrs.first;
208 auto dcopy_ptr = ptrs.second;
209
210 my_parent.my_index.get_nns_by_vector(
211 query,
212 sanisizer::cast<std::size_t>(sanisizer::attest_gez(k)),
213 get_search_k(k),
214 icopy_ptr,
215 dcopy_ptr
216 );
217
218 if (output_indices) {
219 if constexpr(!same_internal_index) {
220 output_indices->clear();
221 output_indices->insert(output_indices->end(), my_indices.begin(), my_indices.end());
222 }
223 }
224
225 if (output_distances) {
226 if constexpr(!same_internal_distance) {
227 output_distances->clear();
228 output_distances->insert(output_distances->end(), my_distances.begin(), my_distances.end());
229 }
230 }
231 }
232
233public:
234 void search(const Data_* query, Index_ k, std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances) {
235 if constexpr(same_internal_data) {
236 search_raw(query, k, output_indices, output_distances);
237 } else {
238 std::copy_n(query, my_parent.my_dim, my_buffer.begin());
239 search_raw(my_buffer.data(), k, output_indices, output_distances);
240 }
241 }
242};
243
244template<
245 typename Index_,
246 typename Data_,
247 typename Distance_,
248 class AnnoyDistance_,
249 typename AnnoyIndex_ = Index_,
250 typename AnnoyData_ = float,
251 class AnnoyRng_ = Annoy::Kiss64Random,
252 class AnnoyThreadPolicy_ = Annoy::AnnoyIndexSingleThreadedBuildPolicy
253>
254class AnnoyPrebuilt final : public knncolle::Prebuilt<Index_, Data_, Distance_> {
255public:
256 template<class Matrix_>
257 AnnoyPrebuilt(const Matrix_& data, const AnnoyOptions& options) :
258 my_dim(data.num_dimensions()),
259 my_obs(data.num_observations()),
260 my_search_mult(options.search_mult),
261 my_index(my_dim)
262 {
263 // check that we can, in fact, represent the number of observations in the specified AnnoyIndex_ type.
264 sanisizer::cast<AnnoyIndex_>(sanisizer::attest_gez(my_obs));
265
266 auto work = data.new_known_extractor();
267 if constexpr(std::is_same<Data_, AnnoyData_>::value) {
268 for (Index_ i = 0; i < my_obs; ++i) {
269 auto ptr = work->next();
270 my_index.add_item(i, ptr);
271 }
272 } else {
273 auto incoming = sanisizer::create<std::vector<AnnoyData_> >(my_dim);
274 for (Index_ i = 0; i < my_obs; ++i) {
275 auto ptr = work->next();
276 std::copy_n(ptr, my_dim, incoming.begin());
277 my_index.add_item(i, incoming.data());
278 }
279 }
280
281 my_index.build(options.num_trees);
282 return;
283 }
284
285private:
286 std::size_t my_dim;
287 Index_ my_obs;
288 double my_search_mult;
289
290 // Unfortunately, AnnoyIndex::save is not const (and also unloads and
291 // reloads the index, for reasons I don't understand). So we manually
292 // handle the saving by looking into the protected members.
293 struct SuperHackyThing final : public Annoy::AnnoyIndex<AnnoyIndex_, AnnoyData_, AnnoyDistance_, AnnoyRng_, AnnoyThreadPolicy_> {
294 template<typename ... Args_>
295 SuperHackyThing(Args_&& ... args) : Annoy::AnnoyIndex<AnnoyIndex_, AnnoyData_, AnnoyDistance_, AnnoyRng_, AnnoyThreadPolicy_>(std::forward<Args_>(args)...) {}
296
297 auto get_nodes() const {
298 return this->_nodes;
299 }
300
301 auto get_s() const {
302 return this->_s;
303 }
304
305 auto get_n_nodes() const {
306 return this->_n_nodes;
307 }
308 };
309 SuperHackyThing my_index;
310
311 friend class AnnoySearcher<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>;
312
313public:
314 std::size_t num_dimensions() const {
315 return my_dim;
316 }
317
318 Index_ num_observations() const {
319 return my_obs;
320 }
321
322 std::unique_ptr<knncolle::Searcher<Index_, Data_, Distance_> > initialize() const {
323 return initialize_known();
324 }
325
326 auto initialize_known() const {
327 return std::make_unique<AnnoySearcher<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_> >(*this);
328 }
329
330public:
331 void save(const std::filesystem::path& dir) const {
332 knncolle::quick_save(dir / "ALGORITHM", annoy_prebuilt_save_name, std::strlen(annoy_prebuilt_save_name));
333 knncolle::quick_save(dir / "NUM_OBS", &my_obs, 1);
334 knncolle::quick_save(dir / "NUM_DIM", &my_dim, 1);
335 knncolle::quick_save(dir / "SEARCH_MULT", &my_search_mult, 1);
336
337 knncolle::NumericType types[2];
340 knncolle::quick_save(dir / "TYPES", types, 2);
341
342 const auto dname = get_distance_name<AnnoyDistance_>();
343 knncolle::quick_save(dir / "DISTANCE", dname, std::strlen(dname));
344
345 // Handling custom Annoy types.
346 auto& icust = custom_save_for_annoy_index<AnnoyIndex_>();
347 if (icust) {
348 icust(dir);
349 }
350
351 auto& dcust = custom_save_for_annoy_data<AnnoyData_>();
352 if (dcust) {
353 dcust(dir);
354 }
355
356 auto& dscust = custom_save_for_annoy_distance<AnnoyDistance_>();
357 if (dscust) {
358 dscust(dir);
359 }
360
361 // Not bothering to save anything else; the RNG and thread policy
362 // should not be relevant once the index is built.
363
364 // For reasons unknown to us, AnnoyIndex::save() is not const, so we have to do it manually.
365 // Hopefully this will be fixed in the future.
366 const auto idxpath = dir / "INDEX";
367 knncolle::quick_save(idxpath, reinterpret_cast<char*>(my_index.get_nodes()), sanisizer::product<std::streamsize>(my_index.get_s(), my_index.get_n_nodes()));
368 }
369
370 AnnoyPrebuilt(const std::filesystem::path& dir, std::size_t ndim) : my_dim(ndim), my_index(ndim) {
371 knncolle::quick_load(dir / "NUM_OBS", &my_obs, 1);
372 knncolle::quick_load(dir / "SEARCH_MULT", &my_search_mult, 1);
373
374 const auto idxpath = (dir / "INDEX").string();
375 char* errbuf = NULL;
376 if (!my_index.load(idxpath.c_str(), true, &errbuf)) {
377 std::runtime_error ex(errbuf);
378 delete errbuf;
379 throw ex;
380 }
381 }
382};
414template<
415 typename Index_,
416 typename Data_,
417 typename Distance_,
418 class AnnoyDistance_,
419 typename AnnoyIndex_ = Index_,
420 typename AnnoyData_ = float,
421 class AnnoyRng_ = Annoy::Kiss64Random,
422 class AnnoyThreadPolicy_ = Annoy::AnnoyIndexSingleThreadedBuildPolicy,
423 class Matrix_ = knncolle::Matrix<Index_, Data_>
424>
425class AnnoyBuilder : public knncolle::Builder<Index_, Data_, Distance_, Matrix_> {
426private:
427 AnnoyOptions my_options;
428
429public:
433 AnnoyBuilder(AnnoyOptions options) : my_options(std::move(options)) {}
434
438 AnnoyBuilder() = default;
439
445 return my_options;
446 }
447
448public:
452 knncolle::Prebuilt<Index_, Data_, Distance_>* build_raw(const Matrix_& data) const {
453 return build_known_raw(data);
454 }
459public:
463 auto build_known_raw(const Matrix_& data) const {
464 return new AnnoyPrebuilt<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>(data, my_options);
465 }
466
470 auto build_known_unique(const Matrix_& data) const {
471 return std::unique_ptr<I<decltype(*build_known_raw(data))> >(build_known_raw(data));
472 }
473
477 auto build_known_shared(const Matrix_& data) const {
478 return std::shared_ptr<I<decltype(*build_known_raw(data))> >(build_known_raw(data));
479 }
480};
481
482}
483
484#endif
virtual Prebuilt< Index_, Data_, Distance_ > * build_raw(const Matrix_ &data) const=0
Perform an approximate nearest neighbor search with Annoy.
Definition Annoy.hpp:425
auto build_known_unique(const Matrix_ &data) const
Definition Annoy.hpp:470
AnnoyBuilder(AnnoyOptions options)
Definition Annoy.hpp:433
AnnoyOptions & get_options()
Definition Annoy.hpp:444
auto build_known_raw(const Matrix_ &data) const
Definition Annoy.hpp:463
auto build_known_shared(const Matrix_ &data) const
Definition Annoy.hpp:477
Approximate nearest neighbor search with Annoy.
Definition Annoy.hpp:24
void quick_load(const std::filesystem::path &path, Input_ *const contents, const Length_ length)
NumericType get_numeric_type()
void quick_save(const std::filesystem::path &path, const Input_ *const contents, const Length_ length)
Options for AnnoyBuilder().
Definition Annoy.hpp:34
int num_trees
Definition Annoy.hpp:39
double search_mult
Definition Annoy.hpp:46
Utilities for the Annoy wrappers.