knncolle_annoy
Annoy nearest neighbors in knncolle
Loading...
Searching...
No Matches
Annoy.hpp
1#ifndef KNNCOLLE_ANNOY_ANNOY_HPP
2#define KNNCOLLE_ANNOY_ANNOY_HPP
3
4#include <vector>
5#include <algorithm>
6#include <memory>
7#include <cstddef>
8#include <cstring>
9#include <stdexcept>
10#include <limits>
11
12#include "knncolle/knncolle.hpp"
13#include "annoy/annoylib.h"
14#include "annoy/kissrandom.h"
15
16#include "utils.hpp"
17
23namespace knncolle_annoy {
24
33 int num_trees = 50;
34
40 double search_mult = -1;
41};
42
46template<typename Index_, typename Data_, typename Distance_, typename AnnoyDistance_, typename AnnoyIndex_, typename AnnoyData_, class AnnoyRng_, class AnnoyThreadPolicy_>
47class AnnoyPrebuilt;
48
49template<
50 typename Index_,
51 typename Data_,
52 typename Distance_,
53 class AnnoyDistance_,
54 typename AnnoyIndex_ = Index_,
55 typename AnnoyData_ = float,
56 class AnnoyRng_ = Annoy::Kiss64Random,
57 class AnnoyThreadPolicy_ = Annoy::AnnoyIndexSingleThreadedBuildPolicy
58>
59class AnnoySearcher final : public knncolle::Searcher<Index_, Data_, Distance_> {
60private:
61 const AnnoyPrebuilt<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>& my_parent;
62
63 static constexpr bool same_internal_data = std::is_same<Data_, AnnoyData_>::value;
64 typename std::conditional<!same_internal_data, std::vector<AnnoyData_>, bool>::type my_buffer;
65
66 static constexpr bool same_internal_index = std::is_same<Index_, AnnoyIndex_>::value;
67 std::vector<AnnoyIndex_> my_indices;
68
69 static constexpr bool same_internal_distance = std::is_same<Distance_, AnnoyData_>::value;
70 typename std::conditional<!same_internal_distance, std::vector<AnnoyData_>, bool>::type my_distances;
71
72 typedef int SearchKType;
73
74 Index_ my_capped_k = 0;
75
76 SearchKType get_search_k(Index_ k) const {
77 if (my_parent.my_search_mult < 1) {
78 return -1; // instructs Annoy to use k * num_trees.
79 } else if (k <= my_capped_k) {
80 return my_parent.my_search_mult * static_cast<double>(k) + 0.5; // rounded.
81 } else {
82 return std::numeric_limits<SearchKType>::max(); // cap to avoid overflow.
83 }
84 }
85
86public:
87 AnnoySearcher(const AnnoyPrebuilt<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>& parent) : my_parent(parent) {
88 if constexpr(!same_internal_data) {
89 sanisizer::resize(my_buffer, my_parent.my_dim);
90 }
91
92 if (my_parent.my_search_mult >= 1) {
93 my_capped_k = static_cast<double>(std::numeric_limits<SearchKType>::max()) / my_parent.my_search_mult;
94 }
95 }
96
97private:
98 std::pair<std::vector<AnnoyIndex_>*, std::vector<AnnoyData_>*> obtain_pointers(std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances, Index_ k) {
99 std::vector<AnnoyIndex_>* icopy_ptr = &my_indices;
100 if (output_indices) {
101 if constexpr(same_internal_index) {
102 icopy_ptr = output_indices;
103 }
104 }
105 icopy_ptr->clear();
106 icopy_ptr->reserve(k);
107
108 std::vector<AnnoyData_>* dcopy_ptr = NULL;
109 if (output_distances) {
110 if constexpr(same_internal_distance) {
111 dcopy_ptr = output_distances;
112 } else {
113 dcopy_ptr = &my_distances;
114 }
115 dcopy_ptr->clear();
116 dcopy_ptr->reserve(k);
117 }
118
119 return std::make_pair(icopy_ptr, dcopy_ptr);
120 }
121
122 template<typename Type_>
123 static void remove_self(std::vector<Type_>& vec, std::size_t at) {
124 if (at < vec.size()) {
125 vec.erase(vec.begin() + at);
126 } else {
127 vec.pop_back();
128 }
129 }
130
131 template<typename Source_, typename Dest_>
132 static void copy_skip_self(const std::vector<Source_>& source, std::vector<Dest_>& dest, std::size_t at) {
133 auto sIt = source.begin();
134 auto end = source.size();
135 dest.clear();
136 dest.reserve(end - 1);
137
138 if (at < end) {
139 dest.insert(dest.end(), sIt, sIt + at);
140 dest.insert(dest.end(), sIt + at + 1, source.end());
141 } else {
142 // Just in case we're full of ties at duplicate points, such that 'c'
143 // is not in the set. Note that, if self_found=false, we must have at
144 // least 'k+2' points for 'c' to not be detected as its own neighbor.
145 // Thus there is no need to worry whether 'end - 1 != k'; we
146 // are guaranteed to return 'k' elements in this case.
147 dest.insert(dest.end(), sIt, sIt + end - 1);
148 }
149 }
150
151public:
152 void search(Index_ i, Index_ k, std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances) {
153 // +1, as it forgets to discard 'self'. This should not overflow as k < num_obs.
154 const auto kp1 = k + 1;
155
156 auto ptrs = obtain_pointers(output_indices, output_distances, kp1);
157 auto icopy_ptr = ptrs.first;
158 auto dcopy_ptr = ptrs.second;
159
160 my_parent.my_index.get_nns_by_item(
161 i,
162 sanisizer::cast<std::size_t>(sanisizer::attest_gez(kp1)),
163 get_search_k(kp1),
164 icopy_ptr,
165 dcopy_ptr
166 );
167
168 std::size_t at;
169 {
170 const auto& cur_i = *icopy_ptr;
171 at = cur_i.size();
172 const AnnoyIndex_ icopy = i; // cast to AnnoyIndex_ to avoid signed/unsigned comparisons.
173 for (std::size_t x = 0, end = cur_i.size(); x < end; ++x) {
174 if (cur_i[x] == icopy) {
175 at = x;
176 break;
177 }
178 }
179 }
180
181 if (output_indices) {
182 if constexpr(same_internal_index) {
183 remove_self(*output_indices, at);
184 } else {
185 copy_skip_self(my_indices, *output_indices, at);
186 }
187 }
188
189 if (output_distances) {
190 if constexpr(same_internal_distance) {
191 remove_self(*output_distances, at);
192 } else {
193 copy_skip_self(my_distances, *output_distances, at);
194 }
195 }
196 }
197
198private:
199 void search_raw(const AnnoyData_* query, Index_ k, std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances) {
200 auto ptrs = obtain_pointers(output_indices, output_distances, k);
201 auto icopy_ptr = ptrs.first;
202 auto dcopy_ptr = ptrs.second;
203
204 my_parent.my_index.get_nns_by_vector(
205 query,
206 sanisizer::cast<std::size_t>(sanisizer::attest_gez(k)),
207 get_search_k(k),
208 icopy_ptr,
209 dcopy_ptr
210 );
211
212 if (output_indices) {
213 if constexpr(!same_internal_index) {
214 output_indices->clear();
215 output_indices->insert(output_indices->end(), my_indices.begin(), my_indices.end());
216 }
217 }
218
219 if (output_distances) {
220 if constexpr(!same_internal_distance) {
221 output_distances->clear();
222 output_distances->insert(output_distances->end(), my_distances.begin(), my_distances.end());
223 }
224 }
225 }
226
227public:
228 void search(const Data_* query, Index_ k, std::vector<Index_>* output_indices, std::vector<Distance_>* output_distances) {
229 if constexpr(same_internal_data) {
230 search_raw(query, k, output_indices, output_distances);
231 } else {
232 std::copy_n(query, my_parent.my_dim, my_buffer.begin());
233 search_raw(my_buffer.data(), k, output_indices, output_distances);
234 }
235 }
236};
237
238template<
239 typename Index_,
240 typename Data_,
241 typename Distance_,
242 class AnnoyDistance_,
243 typename AnnoyIndex_ = Index_,
244 typename AnnoyData_ = float,
245 class AnnoyRng_ = Annoy::Kiss64Random,
246 class AnnoyThreadPolicy_ = Annoy::AnnoyIndexSingleThreadedBuildPolicy
247>
248class AnnoyPrebuilt final : public knncolle::Prebuilt<Index_, Data_, Distance_> {
249public:
250 template<class Matrix_>
251 AnnoyPrebuilt(const Matrix_& data, const AnnoyOptions& options) :
252 my_dim(data.num_dimensions()),
253 my_obs(data.num_observations()),
254 my_search_mult(options.search_mult),
255 my_index(my_dim)
256 {
257 // check that we can, in fact, represent the number of observations in the specified AnnoyIndex_ type.
258 sanisizer::cast<AnnoyIndex_>(sanisizer::attest_gez(my_obs));
259
260 auto work = data.new_known_extractor();
261 if constexpr(std::is_same<Data_, AnnoyData_>::value) {
262 for (Index_ i = 0; i < my_obs; ++i) {
263 auto ptr = work->next();
264 my_index.add_item(i, ptr);
265 }
266 } else {
267 auto incoming = sanisizer::create<std::vector<AnnoyData_> >(my_dim);
268 for (Index_ i = 0; i < my_obs; ++i) {
269 auto ptr = work->next();
270 std::copy_n(ptr, my_dim, incoming.begin());
271 my_index.add_item(i, incoming.data());
272 }
273 }
274
275 my_index.build(options.num_trees);
276 return;
277 }
278
279private:
280 std::size_t my_dim;
281 Index_ my_obs;
282 double my_search_mult;
283
284 // Unfortunately, AnnoyIndex::save is not const (and also unloads and
285 // reloads the index, for reasons I don't understand). So we manually
286 // handle the saving by looking into the protected members.
287 struct SuperHackyThing final : public Annoy::AnnoyIndex<AnnoyIndex_, AnnoyData_, AnnoyDistance_, AnnoyRng_, AnnoyThreadPolicy_> {
288 template<typename ... Args_>
289 SuperHackyThing(Args_&& ... args) : Annoy::AnnoyIndex<AnnoyIndex_, AnnoyData_, AnnoyDistance_, AnnoyRng_, AnnoyThreadPolicy_>(std::forward<Args_>(args)...) {}
290
291 auto get_nodes() const {
292 return this->_nodes;
293 }
294
295 auto get_s() const {
296 return this->_s;
297 }
298
299 auto get_n_nodes() const {
300 return this->_n_nodes;
301 }
302 };
303 SuperHackyThing my_index;
304
305 friend class AnnoySearcher<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>;
306
307public:
308 std::size_t num_dimensions() const {
309 return my_dim;
310 }
311
312 Index_ num_observations() const {
313 return my_obs;
314 }
315
316 std::unique_ptr<knncolle::Searcher<Index_, Data_, Distance_> > initialize() const {
317 return initialize_known();
318 }
319
320 auto initialize_known() const {
321 return std::make_unique<AnnoySearcher<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_> >(*this);
322 }
323
324public:
325 void save(const std::string& prefix) const {
326 knncolle::quick_save(prefix + "ALGORITHM", save_name, std::strlen(save_name));
327 knncolle::quick_save(prefix + "num_obs", &my_obs, 1);
328 knncolle::quick_save(prefix + "num_dim", &my_dim, 1);
329 knncolle::quick_save(prefix + "search_mult", &my_search_mult, 1);
330
331 NumericType types[2];
332 types[0] = get_numeric_type<AnnoyIndex_>();
333 types[1] = get_numeric_type<AnnoyData_>();
334 knncolle::quick_save(prefix + "types", types, 2);
335
336 const auto dname = get_distance_name<AnnoyDistance_>();
337 knncolle::quick_save(prefix + "distance", dname, std::strlen(dname));
338
339 // Not bothering to save anything else; the RNG and thread policy
340 // should not be relevant once the index is built.
341
342 // For reasons unknown to us, AnnoyIndex::save() is not const, so we have to do it manually.
343 // Hopefully this will be fixed in the future.
344 const auto idxpath = prefix + "index";
345 knncolle::quick_save(idxpath, reinterpret_cast<char*>(my_index.get_nodes()), sanisizer::product<std::streamsize>(my_index.get_s(), my_index.get_n_nodes()));
346 }
347
348 AnnoyPrebuilt(const std::string prefix, std::size_t ndim) : my_dim(ndim), my_index(ndim) {
349 knncolle::quick_load(prefix + "num_obs", &my_obs, 1);
350 knncolle::quick_load(prefix + "search_mult", &my_search_mult, 1);
351
352 const auto idxpath = prefix + "index";
353 char* errbuf = NULL;
354 if (!my_index.load(idxpath.c_str(), true, &errbuf)) {
355 std::runtime_error ex(errbuf);
356 delete errbuf;
357 throw ex;
358 }
359 }
360};
392template<
393 typename Index_,
394 typename Data_,
395 typename Distance_,
396 class AnnoyDistance_,
397 typename AnnoyIndex_ = Index_,
398 typename AnnoyData_ = float,
399 class AnnoyRng_ = Annoy::Kiss64Random,
400 class AnnoyThreadPolicy_ = Annoy::AnnoyIndexSingleThreadedBuildPolicy,
401 class Matrix_ = knncolle::Matrix<Index_, Data_>
402>
403class AnnoyBuilder : public knncolle::Builder<Index_, Data_, Distance_, Matrix_> {
404private:
405 AnnoyOptions my_options;
406
407public:
411 AnnoyBuilder(AnnoyOptions options) : my_options(std::move(options)) {}
412
416 AnnoyBuilder() = default;
417
423 return my_options;
424 }
425
426public:
430 knncolle::Prebuilt<Index_, Data_, Distance_>* build_raw(const Matrix_& data) const {
431 return build_known_raw(data);
432 }
437public:
441 auto build_known_raw(const Matrix_& data) const {
442 return new AnnoyPrebuilt<Index_, Data_, Distance_, AnnoyDistance_, AnnoyIndex_, AnnoyData_, AnnoyRng_, AnnoyThreadPolicy_>(data, my_options);
443 }
444
448 auto build_known_unique(const Matrix_& data) const {
449 return std::unique_ptr<I<decltype(*build_known_raw(data))> >(build_known_raw(data));
450 }
451
455 auto build_known_shared(const Matrix_& data) const {
456 return std::shared_ptr<I<decltype(*build_known_raw(data))> >(build_known_raw(data));
457 }
458};
459
460}
461
462#endif
virtual Prebuilt< Index_, Data_, Distance_ > * build_raw(const Matrix_ &data) const=0
Perform an approximate nearest neighbor search with Annoy.
Definition Annoy.hpp:403
auto build_known_unique(const Matrix_ &data) const
Definition Annoy.hpp:448
AnnoyBuilder(AnnoyOptions options)
Definition Annoy.hpp:411
AnnoyOptions & get_options()
Definition Annoy.hpp:422
auto build_known_raw(const Matrix_ &data) const
Definition Annoy.hpp:441
auto build_known_shared(const Matrix_ &data) const
Definition Annoy.hpp:455
Approximate nearest neighbor search with Annoy.
Definition Annoy.hpp:23
NumericType
Definition utils.hpp:64
void quick_load(const std::string &path, Input_ *const contents, const Length_ length)
void quick_save(const std::string &path, const Input_ *const contents, const Length_ length)
Options for AnnoyBuilder().
Definition Annoy.hpp:28
int num_trees
Definition Annoy.hpp:33
double search_mult
Definition Annoy.hpp:40
Utilities for the Annoy wrappers.