-
Notifications
You must be signed in to change notification settings - Fork 85
/
index.cpp
183 lines (162 loc) · 6.17 KB
/
index.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/*
Copyright (C) 2013,2014 Wei Dong <[email protected]>. All Rights Reserved.
*/
// Element type of the dataset, aliased as value_type below.
// Defaults to float unless KGRAPH_VALUE_TYPE is already defined
// (e.g. via -DKGRAPH_VALUE_TYPE=... on the compiler command line).
#ifndef KGRAPH_VALUE_TYPE
#define KGRAPH_VALUE_TYPE float
#endif
#include <sys/time.h>
#include <cctype>
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <boost/timer/timer.hpp>
#include <boost/format.hpp>
#include <boost/program_options.hpp>
#include "kgraph.h"
#include "kgraph-data.h"
// Namespaces pulled into file scope for brevity in this single-file tool.
using namespace std;
using namespace boost;
using namespace boost::timer;
using namespace kgraph;
// Short handle for the command-line parsing library.
namespace po = boost::program_options;

// Scalar type of every matrix element, selected by KGRAPH_VALUE_TYPE.
using value_type = KGRAPH_VALUE_TYPE;
int main (int argc, char *argv[]) {
string data_path;
string output_path;
KGraph::IndexParams params;
unsigned D;
unsigned skip;
unsigned gap;
unsigned synthetic;
float noise;
bool lshkit = true;
po::options_description desc_visible("General options");
desc_visible.add_options()
("help,h", "produce help message.")
("version,v", "print version information.")
("data", po::value(&data_path), "input path")
("output", po::value(&output_path), "output path")
(",K", po::value(¶ms.K)->default_value(default_K), "number of nearest neighbor")
("controls,C", po::value(¶ms.controls)->default_value(default_controls), "number of control pounsigneds")
;
po::options_description desc_hidden("Expert options");
desc_hidden.add_options()
("iterations,I", po::value(¶ms.iterations)->default_value(default_iterations), "")
(",S", po::value(¶ms.S)->default_value(default_S), "")
(",R", po::value(¶ms.R)->default_value(default_R), "")
(",L", po::value(¶ms.L)->default_value(default_L), "")
("delta", po::value(¶ms.delta)->default_value(default_delta), "")
("recall", po::value(¶ms.recall)->default_value(default_recall), "")
("prune", po::value(¶ms.prune)->default_value(default_prune), "")
("reverse", po::value(¶ms.reverse)->default_value(default_reverse), "")
("noise", po::value(&noise)->default_value(0), "noise")
("seed", po::value(¶ms.seed)->default_value(default_seed), "")
("dim,D", po::value(&D), "dimension, see format")
("skip", po::value(&skip)->default_value(0), "see format")
("gap", po::value(&gap)->default_value(0), "see format")
("raw", "read raw binary file, need to specify D.")
("synthetic", po::value(&synthetic)->default_value(0), "generate synthetic data, for performance evaluation only, specify number of points")
("l2norm", "l2-normalize data, so as to mimic cosine similarity")
;
po::options_description desc("Allowed options");
desc.add(desc_visible).add(desc_hidden);
po::positional_options_description p;
p.add("data", 1);
p.add("output", 1);
po::variables_map vm;
po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
po::notify(vm);
if (vm.count("raw") == 1) {
lshkit = false;
}
if (vm.count("version")) {
cout << "KGraph version " << KGraph::version() << endl;
return 0;
}
if (vm.count("help")
|| (synthetic && (vm.count("dim") == 0 || vm.count("data")))
|| (!synthetic && (vm.count("data") == 0 || (vm.count("dim") == 0 && !lshkit)))) {
cout << "Usage: index [OTHER OPTIONS]... INPUT [OUTPUT]" << endl;
cout << desc_visible << endl;
cout << desc_hidden << endl;
return 0;
}
if (params.S == 0) {
params.S = params.K;
}
if (lshkit && (synthetic == 0)) { // read dimension information from the data file
static const unsigned LSHKIT_HEADER = 3;
ifstream is(data_path.c_str(), ios::binary);
unsigned header[LSHKIT_HEADER]; /* entry size, row, col */
is.read((char *)header, sizeof header);
BOOST_VERIFY(is);
BOOST_VERIFY(header[0] == sizeof(value_type));
is.close();
D = header[2];
skip = LSHKIT_HEADER * sizeof(unsigned);
gap = 0;
}
Matrix<value_type> data;
if (synthetic) {
if (!std::is_floating_point<value_type>::value) {
throw std::runtime_error("synthetic data not implemented for non-floating-point values.");
}
data.resize(synthetic, D);
cerr << "Generating synthetic data..." << endl;
default_random_engine rng(params.seed);
uniform_real_distribution<double> distribution(-1.0, 1.0);
data.zero(); // important to do that
for (unsigned i = 0; i < synthetic; ++i) {
value_type *row = data[i];
for (unsigned j = 0; j < D; ++j) {
row[j] = distribution(rng);
}
}
}
else {
data.load(data_path, D, skip, gap);
}
if (noise != 0) {
if (!std::is_floating_point<value_type>::value) {
throw std::runtime_error("noise injection not implemented for non-floating-point value.");
}
//tr1::ranlux64_base_01 rng;
std::default_random_engine rng;
double sum = 0, sum2 = 0;
for (unsigned i = 0; i < data.size(); ++i) {
for (unsigned j = 0; j < data.dim(); ++j) {
value_type v = data[i][j];
sum += v;
sum2 += v * v;
}
}
double total = double(data.size()) * data.dim();
double avg2 = sum2 / total, avg = sum / total;
double dev = sqrt(avg2 - avg * avg);
cerr << "Adding Gaussian noise w/ " << noise << "x sigma(" << dev << ")..." << endl;
std::normal_distribution<double> gaussian(0, noise * dev);
for (unsigned i = 0; i < data.size(); ++i) {
for (unsigned j = 0; j < data.dim(); ++j) {
data[i][j] += gaussian(rng);
}
}
}
if (vm.count("l2norm")) {
cerr << "L2-normalizing data..." << endl;
data.normalize2();
}
MatrixOracle<value_type, metric::l2sqr> oracle(data);
KGraph::IndexInfo info;
KGraph *kgraph = KGraph::create(); //(oracle, params, &info);
{
auto_cpu_timer timer;
kgraph->build(oracle, params, &info);
cerr << info.stop_condition << endl;
}
if (output_path.size()) {
kgraph->save(output_path.c_str());
}
delete kgraph;
return 0;
}