-
Notifications
You must be signed in to change notification settings - Fork 1
/
contraction.h
executable file
·280 lines (200 loc) · 10.8 KB
/
contraction.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#ifndef _CONTRACTION_H_
#define _CONTRACTION_H_
#include <vector>
#include <stdlib.h>
#include <mpi.h>
#include <list>
#include <string>
#include <algorithm>
#include <map>
#include "tensor.h"
#include "helper.h"
#include "redistribute.h"
#include "localcontract.h"
#include "omp.h"
#include "grid.h"
#include "block_grid.h"
#include "transpose_helper.h"
#include "cost.h"
#include "grid_redib.h"
// Integer tags used by the contraction code.
// NOTE(review): presumably EXT_A / EXT_B mark an index as an external index
// of input A or input B (cf. the `ext_input` argument of
// Contraction::fill_input_to_output_map and the is_C_indx_ext_A_or_ext_B
// array), and CONTR_AB marks an index contracted between A and B -- confirm
// against the implementation file.
#define EXT_A 1
#define EXT_B 2
#define CONTR_AB 0
namespace RRR{
// Ideally extend this class and make different contractions from it
class Contraction
{
private:
Grid* grid;
int rank, num_procs;
int *my_address;
//these are debugging buffers to ensure that for every send there is a matching receive.
int* receiver_grid_buffer;
int* sender_grid_buffer;
int num_cntr_indices;
Tensor *A, *B, *C;
int dims_A, dims_B, dims_C;
//permutation map for A, B and C before performing local dgemm
//C also needs an inverse permutation map after the dgemm completess
int *p_map_A, *p_map_B, *p_map_C, *inv_p_map_C;
//stores the block range of C after it has been transposed
//this pock range is used for inverse transposing C back to
//its original layout
int* inverse_block_range_C;
//stores if a given index of C is external index of A or B
int* is_C_indx_ext_A_or_ext_B;
//parameters for dgemm
int n_a, n_b, n_k;
//dgemm flags for A, B and C
int t_A, t_B;
MPI_Request *send_req_addr, *send_req_data;
MPI_Request *recv_req;
// This mapping is used to find tile address of the output tensor
int* A_to_C_map;
int* B_to_C_map;
int num_senders;
int num_receivers;
int* current_senders;
int* current_receivers;
int grid_dims;
int *pgrid;
int serial;
int* self_adr_sizes;
double** self_buffers;
int** self_addresses;
int* self_data_sizes;
int num_self_sends;
// Performance metrics
int local_num_dgemm, total_num_dgemm;
double dgemm_time;
double redist_time, comm_time, tr_time, comp_time, bm_time, total_time;
double max_redist_time, max_comm_time, max_tr_time, max_comp_time, max_bm_time, max_total_time;
double instigation_time, forward_receive_time;
double block_size_forward_time, address_forward_time, data_forward_time;
double timer1, timer2, timer3, timer4, timer5, timer6;
double m_timer1, m_timer2, m_timer3, m_timer4, m_timer5, m_timer6;
double max_block_size_forward_time, max_address_forward_time, max_data_forward_time;
double max_instigation_time, max_forward_receive_time;
map<int, list<pair<int*,int*> > > CRCT_map;
//returns a permutation map that combines the affect of
//applying map1 and map2 consecutively
void compose_p_maps(Tensor* &T, int* &map1, int* &map2, int* &composed_map);
//generates permutation maps based on symmetry for bounced blocks
void get_symm_permutation(Tensor* &T, int* &tile_address, int* &sym_permutation);
//generates permutation maps for A, B and C for doing transpose before local dgemm can be performed
void generate_permutation_map(Tensor* &A, Tensor* &B, Tensor* &C, std::vector<pair<int,int> > &contr_list);
//gives the mapping of external indicies in the input to external
//indicies in the output. This mapping is used to find tile address
//of the output tensor. Also assigns the indicies of input and output
//as external index or internal index. Asigns the indicies of C
//as external index of A or external index of B
void fill_input_to_output_map(string* &input, int input_dims, string* &output, int output_dims, int* &input_to_output_map, int ext_input);
//stores the address of C corresponding to address of A and B in C_address
void get_C_address(int* &A_address,int dims_a, int* &B_address, int dims_b, int* &C_address, int dims_c);
/*Using the input tensors T_A and T_B, figures out the
* permuted addresses of T_C that can be generated by
* contracting T_A and T_B. These addresses will be a
* subset of addresses held by tensor T_C. The location of
* these addresses is given the map
* addresses_to_location */
void get_C_addresses(Tensor* &T_A, Tensor* &T_B, Tensor* &T_C, int* &C_addresses, map<int,int>* &address_to_location);
// Extracts blocks from the serialized input tensor that are required
// for computing blocks in C along a dimension that is serialized in the
// input tensor but distributed in C
int get_blks_from_serialized(int num_dim, string* &ext_dim, int* &ext_grid_dim, Tensor* &T, double* &blocks, int* &addresses);
// Finds out blocks required right now for instigation from a serialized tensor
int get_deserialized_blocks(Tensor* &T, double* &blocks, int* &addresses, int* &des_dims, int* &des_grid_dims, int &num_dims);
// Deserializes a tensor
int deserialize(Tensor* &T, double* &blocks, int* &addresses, int num_dim, int* &ext_T, int* &ext_grid_dim, std::list<int>* &sym_list);
// Deserialize the 2D input tensor T if the other input tensor S is 4D and the output is 2D
Tensor* deserialize_422(Tensor* &T, Tensor* &S);
// Asserts validity of specified contraction
void assert_contr_validity(Tensor* &A, Tensor* &B);
// Send data to instigator
void send_to_instigator( Tensor* &X, int contr_dim, int contr_idx, int &count_addr_sends, int &count_data_sends);
// Receive data at instigator
int recv_at_instigator(Tensor* &X, int contr_dim, int contr_idx, double* &blocks, int* &block_addrs, int send_addr_count, int send_data_count);
public:
// Send data to instigator
void send_to_instigator_rect( Tensor* &X, int contr_dim, int contr_idx, int &count_addr_sends, int &count_data_sends);
private:
// Receive data at instigator
int recv_at_instigator_rect(Tensor* &X, int contr_dim, int contr_idx, double* &blocks, int* &block_addrs, int send_addr_count, int send_data_count);
// Test for completion of bounce send receives of this processor
void test_comm_completion();
//waits until message sizes of all the senders have been recieved
void initial_wait(int num_senders, MPI_Request* &recv_req, int* senders);
// Merges multiple block sets into one removing duplicates
// Returns number of block copied
int merge_self_send_blocks(Tensor* &T, double* &out_blocks, int* &out_addr);
void forward_int(Tensor* A, int* &data, int size, int k, int cdim, bool is_instigator);
void forward_double(Tensor* A, double* &data, int size, int k, int cdim, bool is_instigator);
//forwards the data it holds and recieves the data from the precvious node
void rotate_forward_receive(Tensor* &T, int forward_receive_dim,
int &source_num_blocks,
double* &source_data, int* &source_addr,
int &receive_num_blocks,
double* &receive_data, int* &receive_addr);
// Returns the rank of the previous processor in given dimension
int prev_processor(int dim);
// Returns the rank of the next processor in given dimension
int next_processor(int dim);
// Parses given contraction string and return the number of dimensions
int parse_contr_str(std::string contr_str, std::string* &out);
// Check if redistribution is required for an input tensor
void check_redistr(Tensor* &T, Tensor* &C, int* &new_idmap, std::string* &t, std::string* &c);
// Check if the new idmaps are not aligning same contraction indices along one dimension,
// figure out new idmap by remapping contraction indices
void realign_new_idmap(int* &idmap_A, int* &idmap_B, vector<pair<int,int> > &contr_list,
bool &redistr_A, bool &redistr_B);
// Performs recursive SUMMA
void rec_summa(Tensor* &A, Tensor* &B, double* &C_buffer, std::vector<std::pair<int,int> > contr_list,
std::pair<int,int> prev_cdim1, std::pair<int,int> prev_cdim2);
void rec_summa1(Tensor* &A, Tensor* &B, Tensor* &C, vector<std::pair<int,int> > contr_list,
std::pair<int,int> prev_cdim1, std::pair<int,int> prev_cdim2);
void rec_summa2(Tensor* &A, Tensor* &B, Tensor* &C, vector<std::pair<int,int> > contr_list,
std::pair<int,int> prev_cdim1, std::pair<int,int> prev_cdim2);
void temp_rec_summa(Tensor* &A, Tensor* &B, Tensor* &C, vector<pair<int,int> > contr_list,
pair<int,int> prev_cdim1, pair<int,int> prev_cdim2);
// Perform transpose as required on input tensor blocks and call dgemm
void transpose_and_dgemm_preserve(int num_blocks_A, int num_blocks_B, double* &blocks_A, double* &blocks_B, int* &block_addr_A, int* &block_addr_B, double* &C_buffer);
// Perform transpose as required on input tensor blocks and call dgemm
void transpose_and_dgemm(int num_blocks_A, int num_blocks_B, double* &blocks_A, double* &blocks_B, int* &block_addr_A, int* &block_addr_B, double* &C_buffer);
// Perform 2D matrix multiplication
void kevin_dgemm(int n_a, int n_b, int n_k, double* &A, double* &B, double* &C, int at, int bt, double alpha);
// Print information about the performance of this contraction
void print_time_flops();
void CRCT_generate_map(vector<pair<int,int> > contr_list);
void CRCT_print_map();
list<pair<int*,int*> > CRCT_enumerate_AB_addrs(int* &C_addr, int num_contr_idx, vector<pair<int,int> > contr_list);
void CRCT_check_blocks(int* &addrs_A, int num_blocks_A, int* &addrs_B, int num_blocks_B);
void CRCT_final_validation();
void CRCT_get_AB_addr(list<pair<int*,int*> > &AB_addr_list, int* &A_addr, int* &B_addr,
vector<pair<int,int> > contr_list, int k, int num_contr_idx, int* &contr_dim_sizes);
public:
//Doesnt initialize anything
Contraction(Grid* &g);
// Constructor
Contraction(Tensor* &a, Tensor* &b, Tensor* &c, Grid* &g);
// Destructor
~Contraction();
//Changes the grid if a redistribution is required for contraction
void change_grid(Grid* &g);
// Processors send to instigator and instigator collects the data
int instigate_collection(Tensor* &X, int contr_dim, int contr_idx, double* &blocks, int* &clocks_addrs);
// Processors send to instigator and instigator collects the data in a rectangular grid
int instigate_collection_rect(Tensor* &X, int contr_dim, int contr_idx, double* &blocks, int* &clocks_addrs);
// Contraction procedure
void contract(std::string contr_str_A, std::string contr_str_B, std::string contr_str_C);
/*Reduces the reduction buffer along
* reduction_dims.allORreucex is used to choose wheather
* to do a reduce or an all reduce. THe reduction is done
* in place.*/
void reduction(int* &reduction_dims, double* &reduction_buffer, int count, int allORreducex);
void rotate(Tensor* &T_input, Tensor*& T_output, vector<pair<int,int> > &contr_list,
int is_rotate_A, list<int> &rotate_dims, Tensor* &T_rotate,
double* &data_buffer, int* &address_buffer, int &count);
void display_times();
};
}
#endif