diff --git a/Makefile b/Makefile
index 0cf896c..851b6b1 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,10 @@ CC=g++
 NC=nvcc
-CFLAGS=-std=c++11 -O3
-NFLAGS=-arch=sm_60
+CFLAGS=-std=c++14 -O3
+
+# Tesla T4: sm_75, A100: sm_80
+NFLAGS=-arch=sm_75
 
 SHARED=shared
 SUBWAY=subway
@@ -14,13 +16,13 @@ DEP=$(SHARED)/timer.o $(SHARED)/argument_parsing.o $(SHARED)/graph.o $(SHARED)/s
 all: make1 make2 make3 bfs-sync cc-sync sssp-sync sswp-sync pr-sync bfs-async cc-async sssp-async sswp-async pr-async
 
 make1:
-	make -C $(SHARED)
+	make -C $(SHARED) NFLAGS=${NFLAGS}
 
 make2:
-	make -C $(SUBWAY)
+	make -C $(SUBWAY) NFLAGS=${NFLAGS}
 
 make3:
-	make -C $(TOOLS)
+	make -C $(TOOLS) NFLAGS=${NFLAGS}
 
 bfs-sync: $(SUBWAY)/bfs-sync.o $(DEP)
diff --git a/README.md b/README.md
index 19b164a..a5c1d98 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,15 @@ Graph.wel ("SOURCE DESTINATION WEIGHT" for each edge in each line):
 ```
 1 2 10
 ```
+A TAB character is used as the delimiter in both the .el and .wel graph formats.
+
 To convert these graph files to the binary format, run the following commands in the root folder:
 ```
-tools/converter path_to_Graph.el
-tools/converter path_to_Graph.wel
+cat path_to_Graph.el | tools/converter_stdin el output_filename.bcsr
+cat path_to_Graph.wel | tools/converter_stdin wel output_filename.bwcsr
 ```
-The first command converts Graph.el to the binary CSR format and generates a binary graph file with .bcsr extension under the same directory as the original file. The second command converts Graph.wel to a weighted binary graph file with .bwcsr extension.
+The first command converts Graph.el to the binary CSR format and generates a binary graph file with the .bcsr extension. The second command converts Graph.wel to a weighted binary graph file with the .bwcsr extension. (The binary layout is sketched below.)
 
 #### Running applications in Subway
 The applications take a graph as input as well as some optional arguments.
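For reference, here is a minimal sketch of the binary layout that `converter_stdin` appears to produce for `.bcsr` files, inferred from what `shared/graph.cu` in this patch reads back: a 32-bit node count, then a 64-bit edge count, then a 64-bit `nodePointer` array of length `num_nodes + 1`, then the edge list. The file name is illustrative, and `OutEdge` mirrors the unweighted edge record in `shared/globals.hpp` (the weighted `.bwcsr` records additionally carry a weight):

```
// read_bcsr.cpp -- sanity-check a .bcsr file
// (layout inferred from shared/graph.cu in this patch; file name is illustrative)
#include <cstdint>
#include <fstream>
#include <iostream>
#include <vector>

struct OutEdge { uint32_t end; };  // unweighted edge record, as in shared/globals.hpp

int main()
{
    std::ifstream in("output_filename.bcsr", std::ios::in | std::ios::binary);
    uint32_t num_nodes = 0;  // node IDs remain 32-bit
    uint64_t num_edges = 0;  // the edge count is 64-bit (ull) after this patch
    in.read(reinterpret_cast<char*>(&num_nodes), sizeof(num_nodes));
    in.read(reinterpret_cast<char*>(&num_edges), sizeof(num_edges));

    // CSR index: nodePointer[v] .. nodePointer[v+1] delimit the out-edges of node v.
    std::vector<uint64_t> nodePointer(static_cast<size_t>(num_nodes) + 1);
    std::vector<OutEdge> edgeList(num_edges);
    in.read(reinterpret_cast<char*>(nodePointer.data()), nodePointer.size() * sizeof(uint64_t));
    in.read(reinterpret_cast<char*>(edgeList.data()), num_edges * sizeof(OutEdge));

    std::cout << num_nodes << " nodes, " << num_edges << " edges\n";
    return 0;
}
```

Because the edge count and node pointers are now 64-bit, `.bcsr`/`.bwcsr` files produced by the old `tools/converter` (which stored these as 32-bit values) presumably need to be regenerated with `converter_stdin`.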
For example: diff --git a/shared/Makefile b/shared/Makefile index 32dd448..1ed59c9 100644 --- a/shared/Makefile +++ b/shared/Makefile @@ -1,7 +1,7 @@ CC=g++ NC=nvcc -CFLAGS=-std=c++11 -O3 -NFLAGS=-arch=sm_60 +CFLAGS=-std=c++14 -O3 +#NFLAGS=-arch=sm_80 all: timer.o argument_parsing.o graph.o subgraph.o partitioner.o subgraph_generator.o gpu_kernels.o subway_utilities.o test.o diff --git a/shared/argument_parsing.cu b/shared/argument_parsing.cu index 159d485..9377977 100644 --- a/shared/argument_parsing.cu +++ b/shared/argument_parsing.cu @@ -4,117 +4,117 @@ ArgumentParser::ArgumentParser(int argc, char **argv, bool canHaveSource, bool canHaveItrs) { - this->argc = argc; - this->argv = argv; - this->canHaveSource = canHaveSource; - this->canHaveItrs = canHaveItrs; - - this->sourceNode = 0; - this->deviceID = 0; - this->numberOfItrs = 1; - - hasInput = false; - hasSourceNode = false; - hasOutput = false; - hasDeviceID = false; - hasNumberOfItrs = false; - - Parse(); + this->argc = argc; + this->argv = argv; + this->canHaveSource = canHaveSource; + this->canHaveItrs = canHaveItrs; + + this->sourceNode = 0; + this->deviceID = 0; + this->numberOfItrs = 1; + + hasInput = false; + hasSourceNode = false; + hasOutput = false; + hasDeviceID = false; + hasNumberOfItrs = false; + + Parse(); } - + bool ArgumentParser::Parse() { - try - { - if(argc == 1) - { - cout << GenerateHelpString(); - exit(0); - } - - if(argc == 2) - if ((strcmp(argv[1], "--help") == 0) || - (strcmp(argv[1], "-help") == 0) || - (strcmp(argv[1], "--h") == 0) || - (strcmp(argv[1], "-h") == 0)) - { - cout << GenerateHelpString(); - exit(0); - } - - if(argc%2 == 0) - { - cout << "\nThere was an error parsing command line arguments\n"; - cout << GenerateHelpString(); - exit(0); - } - - - for(int i=1; i\n"; - cout << GenerateHelpString(); - exit(0); - } - } - - if(hasInput) - return true; - else - { - cout << "\nInput graph file argument is required.\n"; - cout << GenerateHelpString(); - exit(0); - } - } - catch( const std::exception& strException ) { - std::cerr << strException.what() << "\n"; - GenerateHelpString(); - exit(0); - } - catch(...) { - std::cerr << "An exception has occurred.\n"; - GenerateHelpString(); - exit(0); - } + try + { + if(argc == 1) + { + cout << GenerateHelpString(); + exit(0); + } + + if(argc == 2) + if ((strcmp(argv[1], "--help") == 0) || + (strcmp(argv[1], "-help") == 0) || + (strcmp(argv[1], "--h") == 0) || + (strcmp(argv[1], "-h") == 0)) + { + cout << GenerateHelpString(); + exit(0); + } + + if(argc%2 == 0) + { + cout << "\nThere was an error parsing command line arguments\n"; + cout << GenerateHelpString(); + exit(0); + } + + + for(int i=1; i\n"; + cout << GenerateHelpString(); + exit(0); + } + } + + if(hasInput) + return true; + else + { + cout << "\nInput graph file argument is required.\n"; + cout << GenerateHelpString(); + exit(0); + } + } + catch( const std::exception& strException ) { + std::cerr << strException.what() << "\n"; + GenerateHelpString(); + exit(0); + } + catch(...) { + std::cerr << "An exception has occurred.\n"; + GenerateHelpString(); + exit(0); + } } string ArgumentParser::GenerateHelpString(){ - string str = "\nRequired arguments:"; - str += "\n [--input]: Input graph file. E.g., --input FacebookGraph.txt"; - str += "\nOptional arguments"; - if(canHaveSource) - str += "\n [--source]: Begins from the source (Default: 0). E.g., --source 10"; - str += "\n [--output]: Output file for results. E.g., --output results.txt"; - str += "\n [--device]: Select GPU device (default: 0). 
E.g., --device 1"; - if(canHaveItrs) - str += "\n [--iteration]: Number of iterations (default: 1). E.g., --iterations 10"; - str += "\n\n"; - return str; + string str = "\nRequired arguments:"; + str += "\n [--input]: Input graph file. E.g., --input FacebookGraph.txt"; + str += "\nOptional arguments"; + if(canHaveSource) + str += "\n [--source]: Begins from the source (Default: 0). E.g., --source 10"; + str += "\n [--output]: Output file for results. E.g., --output results.txt"; + str += "\n [--device]: Select GPU device (default: 0). E.g., --device 1"; + if(canHaveItrs) + str += "\n [--iteration]: Number of iterations (default: 1). E.g., --iterations 10"; + str += "\n\n"; + return str; } diff --git a/shared/argument_parsing.cuh b/shared/argument_parsing.cuh index 25dd771..b8ba411 100644 --- a/shared/argument_parsing.cuh +++ b/shared/argument_parsing.cuh @@ -9,31 +9,31 @@ class ArgumentParser private: public: - int argc; - char** argv; - - bool canHaveSource; - bool canHaveItrs; - - bool hasInput; - bool hasSourceNode; - bool hasOutput; - bool hasDeviceID; - bool hasNumberOfItrs; - string input; - int sourceNode; - string output; - int deviceID; - int numberOfItrs; - - - ArgumentParser(int argc, char **argv, bool canHaveSource, bool canHaveItrs); - - bool Parse(); - - string GenerateHelpString(); - + int argc; + char** argv; + + bool canHaveSource; + bool canHaveItrs; + + bool hasInput; + bool hasSourceNode; + bool hasOutput; + bool hasDeviceID; + bool hasNumberOfItrs; + string input; + int sourceNode; + string output; + int deviceID; + int numberOfItrs; + + + ArgumentParser(int argc, char **argv, bool canHaveSource, bool canHaveItrs); + + bool Parse(); + + string GenerateHelpString(); + }; -#endif // ARGUMENT_PARSING_HPP +#endif // ARGUMENT_PARSING_HPP diff --git a/shared/globals.hpp b/shared/globals.hpp index 99b6c03..3fbe3f3 100644 --- a/shared/globals.hpp +++ b/shared/globals.hpp @@ -40,12 +40,12 @@ struct OutEdgeWeighted{ }; struct Edge{ - uint source; + uint source; uint end; }; struct EdgeWeighted{ - uint source; + uint source; uint end; uint w8; }; @@ -53,4 +53,4 @@ struct EdgeWeighted{ -#endif // GLOBALS_HPP +#endif // GLOBALS_HPP diff --git a/shared/gpu_error_check.cuh b/shared/gpu_error_check.cuh index 108d071..c175615 100644 --- a/shared/gpu_error_check.cuh +++ b/shared/gpu_error_check.cuh @@ -1,5 +1,5 @@ -#ifndef GPU_ERROR_CHECK_CUH -#define GPU_ERROR_CHECK_CUH +#ifndef GPU_ERROR_CHECK_CUH +#define GPU_ERROR_CHECK_CUH //#include //#include @@ -15,4 +15,4 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t } } -#endif // GPU_ERROR_CHECK_CUH +#endif // GPU_ERROR_CHECK_CUH diff --git a/shared/gpu_kernels.cu b/shared/gpu_kernels.cu index 2955cee..ac11142 100644 --- a/shared/gpu_kernels.cu +++ b/shared/gpu_kernels.cu @@ -1,4 +1,3 @@ - #include "gpu_kernels.cuh" #include "globals.hpp" #include "gpu_error_check.cuh" @@ -6,530 +5,465 @@ #include "subgraph.cuh" -__global__ void bfs_kernel(unsigned int numNodes, - unsigned int from, - unsigned int numPartitionedEdges, - unsigned int *activeNodes, - unsigned int *activeNodesPointer, - OutEdge *edgeList, - unsigned int *outDegree, - unsigned int *value, - //bool *finished, - bool *label1, - bool *label2) +__global__ void bfs_kernel(uint numNodes, + uint from, + ull numPartitionedEdges, + uint *activeNodes, + ull *activeNodesPointer, + OutEdge *edgeList, + uint *outDegree, + uint *value, + bool *label1, + bool *label2) { - unsigned int tId = blockDim.x * blockIdx.x + threadIdx.x; - - if(tId < numNodes) - { 
- unsigned int id = activeNodes[from + tId]; - - if(label1[id] == false) - return; - - label1[id] = false; - - unsigned int sourceWeight = value[id]; - - unsigned int thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int degree = outDegree[id]; - unsigned int thisTo = thisFrom + degree; - - //printf("******* %i\n", thisFrom); - - unsigned int finalDist; - - for(unsigned int i=thisFrom; i dist[edgeList[i].end]) - { - atomicMax(&dist[edgeList[i].end] , finalDist); - - //*finished = false; - - //label1[edgeList[i].end] = true; - - label2[edgeList[i].end] = true; - } - } - } + uint tId = blockDim.x * blockIdx.x + threadIdx.x; + + if(tId < numNodes) + { + uint id = activeNodes[from + tId]; + if(label1[id] == false) + return; + label1[id] = false; + uint sourceWeight = dist[id]; + + ull thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; + uint degree = outDegree[id]; + ull thisTo = thisFrom + degree; + //printf("******* %i\n", thisFrom); + uint finalDist; + for(ull i=thisFrom; i dist[edgeList[i].end]) + { + atomicMax(&dist[edgeList[i].end] , finalDist); + + //*finished = false; + //label1[edgeList[i].end] = true; + + label2[edgeList[i].end] = true; + } + } + } } -__global__ void pr_kernel(unsigned int numNodes, - unsigned int from, - unsigned int numPartitionedEdges, - unsigned int *activeNodes, - unsigned int *activeNodesPointer, - OutEdge *edgeList, - unsigned int *outDegree, - float *dist, - float *delta, - //bool *finished, - float acc) +__global__ void pr_kernel(uint numNodes, + uint from, + ull numPartitionedEdges, + uint *activeNodes, + ull *activeNodesPointer, + OutEdge *edgeList, + uint *outDegree, + float *dist, + float *delta, + //bool *finished, + float acc) { - unsigned int tId = blockDim.x * blockIdx.x + threadIdx.x; - - if(tId < numNodes) - { - unsigned int id = activeNodes[from + tId]; - unsigned int degree = outDegree[id]; - float thisDelta = delta[id]; - - if(thisDelta > acc) - { - dist[id] += thisDelta; - - if(degree != 0) - { - //*finished = false; - - float sourcePR = ((float) thisDelta / degree) * 0.85; - - unsigned int thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int thisto = thisfrom + degree; - - for(unsigned int i=thisfrom; i acc) + { + dist[id] += thisDelta; + if(degree != 0) + { + //*finished = false; + float sourcePR = ((float) thisDelta / degree) * 0.85; + + ull thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; + ull thisto = thisfrom + degree; + for(ull i=thisfrom; i dist[edgeList[i].end]) - { - atomicMax(&dist[edgeList[i].end] , finalDist); - - *finished = false; - - //label1[edgeList[i].end] = true; - - label2[edgeList[i].end] = true; - } - } - } + uint tId = blockDim.x * blockIdx.x + threadIdx.x; + + if(tId < numNodes) + { + uint id = activeNodes[from + tId]; + if(label1[id] == false) + return; + label1[id] = false; + uint sourceWeight = dist[id]; + + ull thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; + uint degree = outDegree[id]; + uint thisTo = thisFrom + degree; + uint finalDist; + for(ull i=thisFrom; i dist[edgeList[i].end]) + { + atomicMax(&dist[edgeList[i].end] , finalDist); + + *finished = false; + //label1[edgeList[i].end] = true; + + label2[edgeList[i].end] = true; + } + } + } } -__global__ void cc_async(unsigned int numNodes, - unsigned int from, - unsigned int numPartitionedEdges, - unsigned int *activeNodes, - unsigned int *activeNodesPointer, - OutEdge *edgeList, - unsigned int *outDegree, - unsigned int *dist, - bool *finished, - bool *label1, - bool *label2) 
+__global__ void cc_async(uint numNodes, + uint from, + ull numPartitionedEdges, + uint *activeNodes, + ull *activeNodesPointer, + OutEdge *edgeList, + uint *outDegree, + uint *dist, + bool *finished, + bool *label1, + bool *label2) { - unsigned int tId = blockDim.x * blockIdx.x + threadIdx.x; - - if(tId < numNodes) - { - unsigned int id = activeNodes[from + tId]; - - if(label1[id] == false) - return; - - label1[id] = false; - - unsigned int sourceWeight = dist[id]; - - unsigned int thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int degree = outDegree[id]; - unsigned int thisTo = thisFrom + degree; - - //printf("******* %i\n", thisFrom); - - //unsigned int finalDist; - - for(unsigned int i=thisFrom; i acc) - { - dist[id] += thisDelta; - - if(degree != 0) - { - *finished = false; - - float sourcePR = ((float) thisDelta / degree) * 0.85; - - unsigned int thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int thisto = thisfrom + degree; - - for(unsigned int i=thisfrom; i acc) + { + dist[id] += thisDelta; + if(degree != 0) + { + *finished = false; + float sourcePR = ((float) thisDelta / degree) * 0.85; + + ull thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; + ull thisto = thisfrom + degree; + for(ull i=thisfrom; i Graph::Graph(string graphFilePath, bool isWeighted) { - this->graphFilePath = graphFilePath; - this->isWeighted = isWeighted; + this->graphFilePath = graphFilePath; + this->isWeighted = isWeighted; } template @@ -31,169 +31,42 @@ void Graph::AssignW8(uint w8, uint index) template void Graph::ReadGraph() { + cout << "Reading the input graph from the following file:\n>> " << graphFilePath << endl; + this->graphFormat = GetFileExtension(graphFilePath); + if(graphFormat == "bcsr" || graphFormat == "bwcsr") + { + ifstream infile (graphFilePath, ios::in | ios::binary); + infile.read ((char*)&num_nodes, sizeof(uint)); + infile.read ((char*)&num_edges, sizeof(ull)); + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + ull num_node = num_nodes; + infile.read ((char*)nodePointer, (num_node + 1 ) * sizeof(ull) ); + infile.read ((char*)edgeList, num_edges * sizeof(E) ); + std::cout << nodePointer[num_nodes] << std::endl; + } + else + { + cout << "The graph format is not supported!\n"; + exit(-1); + } + outDegree = new uint[num_nodes]; + for(uint i=1; i> " << graphFilePath << endl; - - this->graphFormat = GetFileExtension(graphFilePath); - - if(graphFormat == "bcsr" || graphFormat == "bwcsr") - { - ifstream infile (graphFilePath, ios::in | ios::binary); - - infile.read ((char*)&num_nodes, sizeof(uint)); - infile.read ((char*)&num_edges, sizeof(uint)); - - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - - infile.read ((char*)nodePointer, sizeof(uint)*num_nodes); - infile.read ((char*)edgeList, sizeof(E)*num_edges); - nodePointer[num_nodes] = num_edges; - } - else if(graphFormat == "el" || graphFormat == "wel") - { - ifstream infile; - infile.open(graphFilePath); - stringstream ss; - uint max = 0; - string line; - uint edgeCounter = 0; - if(isWeighted) - { - vector edges; - EdgeWeighted newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - ss >> newEdge.w8; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - 
num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i edges; - Edge newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i> " << graphFilePath << endl; - - this->graphFormat = GetFileExtension(graphFilePath); - - if(graphFormat == "bcsr" || graphFormat == "bwcsr") - { - ifstream infile (graphFilePath, ios::in | ios::binary); - - infile.read ((char*)&num_nodes, sizeof(uint)); - infile.read ((char*)&num_edges, sizeof(uint)); - - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - - infile.read ((char*)nodePointer, sizeof(uint)*num_nodes); - infile.read ((char*)edgeList, sizeof(E)*num_edges); - nodePointer[num_nodes] = num_edges; - } - else if(graphFormat == "el" || graphFormat == "wel") - { - ifstream infile; - infile.open(graphFilePath); - stringstream ss; - uint max = 0; - string line; - uint edgeCounter = 0; - if(isWeighted) - { - vector edges; - EdgeWeighted newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - ss >> newEdge.w8; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i edges; - Edge newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i> " << graphFilePath << endl; + this->graphFormat = GetFileExtension(graphFilePath); + if(graphFormat == "bcsr" || graphFormat == "bwcsr") + { + ifstream infile (graphFilePath, ios::in | ios::binary); + infile.read ((char*)&num_nodes, sizeof(uint)); + infile.read ((char*)&num_edges, sizeof(ull)); + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + ull n = num_nodes; + infile.read ((char*)nodePointer, (n+1) * sizeof(ull)); + infile.read ((char*)edgeList, num_edges * sizeof(E)); + nodePointer[num_nodes] = num_edges; + } + else if(graphFormat == "el" || graphFormat == "wel") + { + ifstream infile; + infile.open(graphFilePath); + stringstream ss; + uint max = 0; + string line; + ull edgeCounter = 0; + if(isWeighted) + { + vector edges; + EdgeWeighted newEdge; + while(getline( infile, line )) + { + ss.str(""); + ss.clear(); + ss << line; + ss >> newEdge.source; + ss >> newEdge.end; + ss >> 
newEdge.w8; + edges.push_back(newEdge); + edgeCounter++; + if(max < newEdge.source) + max = newEdge.source; + if(max < newEdge.end) + max = newEdge.end; + } + infile.close(); + num_nodes = max + 1; + num_edges = edgeCounter; + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + uint *degree = new uint[num_nodes]; + for(uint i=0; i edges; + Edge newEdge; + while(getline( infile, line )) + { + ss.str(""); + ss.clear(); + ss << line; + ss >> newEdge.source; + ss >> newEdge.end; + edges.push_back(newEdge); + edgeCounter++; + if(max < newEdge.source) + max = newEdge.source; + if(max < newEdge.end) + max = newEdge.end; + } + infile.close(); + num_nodes = max + 1; + num_edges = edgeCounter; + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + uint *degree = new uint[num_nodes]; + for(uint i=0; i Partitioner::Partitioner() { - reset(); + reset(); } template void Partitioner::partition(Subgraph &subgraph, uint numActiveNodes) { - reset(); - - unsigned int from, to; - unsigned int left, right, mid; - unsigned int partitionSize; - unsigned int numNodesInPartition; - unsigned int numPartitionedEdges; - bool foundTo; - unsigned int accurCount; - - - from = 0; - to = numActiveNodes; // last in pointers - numPartitionedEdges = 0; - - do - { - left = from; - right = numActiveNodes; + reset(); + uint from, to; + uint left, right, mid; + ull partitionSize; + uint numNodesInPartition; + ull numPartitionedEdges; + bool foundTo; + ull accurCount; + from = 0; + to = numActiveNodes; // last in pointers + numPartitionedEdges = 0; + do + { + left = from; + right = numActiveNodes; - //cout << "#active nodes: " << numActiveNodes << endl; - //cout << "left: " << left << " right: " << right << endl; - //cout << "pointer to left: " << subgraph.activeNodesPointer[left] << " pointer to right: " << subgraph.activeNodesPointer[right] << endl; + std::cout << "#active nodes: " << numActiveNodes << std::endl; + std::cout << "left: " << left << " right: " << right << std::endl; + std::cout << "pointer to left: " << subgraph.activeNodesPointer[left] << " pointer to right: " << subgraph.activeNodesPointer[right] << std::endl; - partitionSize = subgraph.activeNodesPointer[right] - subgraph.activeNodesPointer[left]; - if(partitionSize <= subgraph.max_partition_size) - { - to = right; - } - else - { - foundTo = false; - accurCount = 10; - while(foundTo==false || accurCount>0) - { - mid = (left + right)/2; - partitionSize = subgraph.activeNodesPointer[mid] - subgraph.activeNodesPointer[from]; - if(foundTo == true) - accurCount--; - if(partitionSize <= subgraph.max_partition_size) - { - left = mid; - to = mid; - foundTo = true; - } - else - { - right = mid; - } - } - + partitionSize = subgraph.activeNodesPointer[right] - subgraph.activeNodesPointer[left]; + //std::cout << "partitionSize: " << partitionSize << std::endl; - if(to == numActiveNodes) - { - cout << "Error in Partitioning...\n"; - exit(-1); - } + if(partitionSize <= subgraph.max_partition_size) + { + to = right; + } + else + { + foundTo = false; + accurCount = 10; + while(foundTo==false || accurCount>0) + { + mid = (left + right)/2; + partitionSize = subgraph.activeNodesPointer[mid] - subgraph.activeNodesPointer[from]; + if(foundTo == true) + accurCount--; + if(partitionSize <= subgraph.max_partition_size) + { + left = mid; + to = mid; + foundTo = true; + } + else + { + right = mid; + } + } + if(to == numActiveNodes) + { + cout << "Error in Partitioning...\n"; + 
exit(-1); + } - } + } - partitionSize = subgraph.activeNodesPointer[to] - subgraph.activeNodesPointer[from]; - numNodesInPartition = to - from; + partitionSize = subgraph.activeNodesPointer[to] - subgraph.activeNodesPointer[from]; + numNodesInPartition = to - from; - //cout << "from: " << from << " to: " << to << endl; - //cout << "#nodes in P: " << numNodesInPartition << " #edges in P: " << partitionSize << endl; - - fromNode.push_back(from); - fromEdge.push_back(numPartitionedEdges); - partitionNodeSize.push_back(numNodesInPartition); - partitionEdgeSize.push_back(partitionSize); - - from = to; - numPartitionedEdges += partitionSize; - - } while (to != numActiveNodes); - - numPartitions = fromNode.size(); + //std::cout << "from: " << from << " to: " << to << std::endl; + //std::cout << "#nodes in P: " << numNodesInPartition << " #edges in P: " << partitionSize << std::endl; + fromNode.push_back(from); + fromEdge.push_back(numPartitionedEdges); + partitionNodeSize.push_back(numNodesInPartition); + partitionEdgeSize.push_back(partitionSize); + from = to; + numPartitionedEdges += partitionSize; + } while (to != numActiveNodes); + numPartitions = fromNode.size(); } template void Partitioner::reset() { - fromNode.clear(); - fromEdge.clear(); - partitionNodeSize.clear(); - partitionEdgeSize.clear(); - numPartitions = 0; + fromNode.clear(); + fromEdge.clear(); + partitionNodeSize.clear(); + partitionEdgeSize.clear(); + numPartitions = 0; } template class Partitioner; diff --git a/shared/partitioner.cuh b/shared/partitioner.cuh index 8a1e1fb..9118037 100644 --- a/shared/partitioner.cuh +++ b/shared/partitioner.cuh @@ -11,17 +11,17 @@ class Partitioner private: public: - uint numPartitions; - vector fromNode; - vector fromEdge; - vector partitionNodeSize; - vector partitionEdgeSize; - Partitioner(); + uint numPartitions; + vector fromNode; + vector fromEdge; + vector partitionNodeSize; + vector partitionEdgeSize; + Partitioner(); void partition(Subgraph &subgraph, uint numActiveNodes); void reset(); }; -#endif // PARTITIONER_CUH +#endif // PARTITIONER_CUH diff --git a/shared/subgraph.cu b/shared/subgraph.cu index bef7f7c..09c6291 100644 --- a/shared/subgraph.cu +++ b/shared/subgraph.cu @@ -1,44 +1,50 @@ - #include "subgraph.cuh" #include "gpu_error_check.cuh" #include "graph.cuh" #include - +#include template -Subgraph::Subgraph(uint num_nodes, uint num_edges) +Subgraph::Subgraph(uint num_nodes, ull num_edges) { - cudaProfilerStart(); - cudaError_t error; - cudaDeviceProp dev; - int deviceID; - cudaGetDevice(&deviceID); - error = cudaGetDeviceProperties(&dev, deviceID); - if(error != cudaSuccess) - { - printf("Error: %s\n", cudaGetErrorString(error)); - exit(-1); - } - cudaProfilerStop(); - - max_partition_size = 0.9 * (dev.totalGlobalMem - 8*4*num_nodes) / sizeof(E); - //max_partition_size = 1000000000; - - if(max_partition_size > DIST_INFINITY) - max_partition_size = DIST_INFINITY; - - //cout << "Max Partition Size: " << max_partition_size << endl; - - this->num_nodes = num_nodes; - this->num_edges = num_edges; - - gpuErrorcheck(cudaMallocHost(&activeNodes, num_nodes * sizeof(uint))); - gpuErrorcheck(cudaMallocHost(&activeNodesPointer, (num_nodes+1) * sizeof(uint))); - gpuErrorcheck(cudaMallocHost(&activeEdgeList, num_edges * sizeof(E))); - - gpuErrorcheck(cudaMalloc(&d_activeNodes, num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeNodesPointer, (num_nodes+1) * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeEdgeList, (max_partition_size) * sizeof(E))); + 
cudaProfilerStart(); + cudaError_t error; + cudaDeviceProp dev; + int deviceID; + cudaGetDevice(&deviceID); + error = cudaGetDeviceProperties(&dev, deviceID); + if(error != cudaSuccess) + { + printf("Error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaProfilerStop(); + + std::cout << "num_edges: " << num_edges << std::endl; + std::cout << "device memory(bytes): " << dev.totalGlobalMem << std::endl; + + float estimated_gpu_memory_size = float(num_nodes) * 20 * 4; + if (dev.totalGlobalMem > estimated_gpu_memory_size ){ + max_partition_size = 0.9 * (dev.totalGlobalMem - estimated_gpu_memory_size) / sizeof(E); + }else { + std::cout << "no sufficient memory" << std::endl; + exit(-1); + } + //if(max_partition_size > DIST_INFINITY) + // max_partition_size = DIST_INFINITY; + std::cout << "Max Partition Size: " << max_partition_size << std::endl; + this->num_nodes = num_nodes; + this->num_edges = num_edges; + + ull m = num_nodes; + gpuErrorcheck(cudaMallocHost(&activeNodes, m * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&activeNodesPointer, (m+1) * sizeof(ull))); + gpuErrorcheck(cudaMallocHost(&activeEdgeList, num_edges * sizeof(E))); + + gpuErrorcheck(cudaMalloc(&d_activeNodes, m * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_activeNodesPointer, (m+1) * sizeof(ull))); + gpuErrorcheck(cudaMalloc(&d_activeEdgeList, (max_partition_size) * sizeof(E))); + std::cout << "subgraph .." << std::endl; } template class Subgraph; @@ -48,7 +54,7 @@ template class Subgraph; //unsigned int numActiveNodes = 1; //subgraph.activeNodes[0] = SOURCE_NODE; //for(unsigned int i=graph.nodePointer[SOURCE_NODE], j=0; i acc) - { - activeNodesLabeling[id] = 1; - } - else - { - activeNodesLabeling[id] = 0; - } - activeNodesDegree[id] = 0; - if(activeNodesLabeling[id] == 1) - activeNodesDegree[id] = outDegree[id]; - } + uint id = blockDim.x * blockIdx.x + threadIdx.x; + if(id < numNodes){ + if(delta[id] > acc) + { + activeNodesLabeling[id] = 1; + } + else + { + activeNodesLabeling[id] = 0; + } + activeNodesDegree[id] = 0; + if(activeNodesLabeling[id] == 1) + activeNodesDegree[id] = outDegree[id]; + } } -__global__ void makeQueue(unsigned int *activeNodes, unsigned int *activeNodesLabeling, - unsigned int *prefixLabeling, unsigned int numNodes) +__global__ void makeQueue(uint *activeNodes, uint *activeNodesLabeling, + uint *prefixLabeling, uint numNodes) { - unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; - if(id < numNodes && activeNodesLabeling[id] == 1){ - activeNodes[prefixLabeling[id]] = id; - } + uint id = blockDim.x * blockIdx.x + threadIdx.x; + if(id < numNodes && activeNodesLabeling[id] == 1){ + activeNodes[prefixLabeling[id]] = id; + } } -__global__ void makeActiveNodesPointer(unsigned int *activeNodesPointer, unsigned int *activeNodesLabeling, - unsigned int *prefixLabeling, unsigned int *prefixSumDegrees, - unsigned int numNodes) +__global__ void makeActiveNodesPointer(ull *activeNodesPointer, uint *activeNodesLabeling, + uint *prefixLabeling, ull *prefixSumDegrees, + uint numNodes) { - unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; - if(id < numNodes && activeNodesLabeling[id] == 1){ - activeNodesPointer[prefixLabeling[id]] = prefixSumDegrees[id]; - } + uint id = blockDim.x * blockIdx.x + threadIdx.x; + if(id < numNodes && activeNodesLabeling[id] == 1){ + activeNodesPointer[prefixLabeling[id]] = prefixSumDegrees[id]; + } } // pthread template -void dynamic(unsigned int tId, - unsigned int numThreads, - unsigned int numActiveNodes, - unsigned int *activeNodes, - unsigned int 
*outDegree, - unsigned int *activeNodesPointer, - unsigned int *nodePointer, - E *activeEdgeList, - E *edgeList) +void dynamic(uint tId, + uint numThreads, + uint numActiveNodes, + uint *activeNodes, + uint *outDegree, + ull *activeNodesPointer, + ull *nodePointer, + E *activeEdgeList, + E *edgeList) { - unsigned int chunkSize = ceil(numActiveNodes / numThreads); - unsigned int left, right; - left = tId * chunkSize; - right = min(left+chunkSize, numActiveNodes); - - unsigned int thisNode; - unsigned int thisDegree; - unsigned int fromHere; - unsigned int fromThere; - - for(unsigned int i=left; i SubgraphGenerator::SubgraphGenerator(Graph &graph) { - gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (graph.num_nodes+1) * sizeof(unsigned int))); - - gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (graph.num_nodes+1) * sizeof(unsigned int))); + ull l = graph.num_nodes; + gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (l+1) * sizeof(ull))); + + gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (l+1) * sizeof(ull))); } template SubgraphGenerator::SubgraphGenerator(GraphPR &graph) { - gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (graph.num_nodes+1) * sizeof(unsigned int))); - - gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (graph.num_nodes+1) * sizeof(unsigned int))); + ull l = graph.num_nodes; + gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (l+1) * sizeof(ull))); + + gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (l+1) * sizeof(ull))); } template void SubgraphGenerator::generate(Graph &graph, Subgraph &subgraph) { - //std::chrono::time_point startDynG, finishDynG; - //startDynG = 
std::chrono::system_clock::now(); - - prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_label1, graph.d_label2, graph.num_nodes); - - thrust::device_ptr ptr_labeling(d_activeNodesLabeling); - thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); - - subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); - //cout << "Number of Active Nodes = " << subgraph.numActiveNodes << endl; - - thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); - - makeQueue<<>>(subgraph.d_activeNodes, d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - thrust::device_ptr ptr_degrees(d_activeNodesDegree); - thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); - - thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); - - makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - unsigned int numActiveEdges = 0; - if(subgraph.numActiveNodes>0) - numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; - - unsigned int last = numActiveEdges; - gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(unsigned int), cudaMemcpyHostToDevice)); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, (subgraph.numActiveNodes+1)*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - - //finishDynG = std::chrono::system_clock::now(); - //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; - //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); - //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; - - //td::chrono::time_point startDynC, finishDynC; - //startDynC = std::chrono::system_clock::now(); - - unsigned int numThreads = NUM_THREADS; - - if(subgraph.numActiveNodes < THRESHOLD_THREAD) - numThreads = 1; - - thread runThreads[numThreads]; - - for(unsigned int t=0; t, - t, - numThreads, - subgraph.numActiveNodes, - subgraph.activeNodes, - graph.outDegree, - subgraph.activeNodesPointer, - graph.nodePointer, - subgraph.activeEdgeList, - graph.edgeList); - - } - - for(unsigned int t=0; t elapsed_seconds_dync = finishDynC-startDynC; - //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); - //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; - + //std::chrono::time_point startDynG, finishDynG; + //startDynG = std::chrono::system_clock::now(); + prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_label1, graph.d_label2, graph.num_nodes); + thrust::device_ptr ptr_labeling(d_activeNodesLabeling); + thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); + subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); + //std::cout << "Number of Active Nodes = " << subgraph.numActiveNodes << std::endl; + thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); + makeQueue<<>>(subgraph.d_activeNodes, 
d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, subgraph.numActiveNodes*sizeof(uint), cudaMemcpyDeviceToHost)); + thrust::device_ptr ptr_degrees(d_activeNodesDegree); + thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); + thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); + makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); + ull n = subgraph.numActiveNodes; + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, n*sizeof(ull), cudaMemcpyDeviceToHost)); + ull numActiveEdges = 0; + if(subgraph.numActiveNodes>0) + numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; + ull last = numActiveEdges; + gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(ull), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, (n+1)*sizeof(ull), cudaMemcpyDeviceToHost)); + //finishDynG = std::chrono::system_clock::now(); + //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; + //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); + //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; + //td::chrono::time_point startDynC, finishDynC; + //startDynC = std::chrono::system_clock::now(); + uint numThreads = NUM_THREADS; + + if(subgraph.numActiveNodes < THRESHOLD_THREAD) + numThreads = 1; + + thread runThreads[numThreads]; + for(uint t=0; t < numThreads; t++) + { + + runThreads[t] = thread(dynamic, + t, + numThreads, + subgraph.numActiveNodes, + subgraph.activeNodes, + graph.outDegree, + subgraph.activeNodesPointer, + graph.nodePointer, + subgraph.activeEdgeList, + graph.edgeList); + + } + for(uint t=0; t elapsed_seconds_dync = finishDynC-startDynC; + //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); + //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; } @@ -208,80 +189,64 @@ void SubgraphGenerator::generate(Graph &graph, Subgraph &subgraph) template void SubgraphGenerator::generate(GraphPR &graph, Subgraph &subgraph, float acc) { - //std::chrono::time_point startDynG, finishDynG; - //startDynG = std::chrono::system_clock::now(); - - prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_delta, graph.num_nodes, acc); - - thrust::device_ptr ptr_labeling(d_activeNodesLabeling); - thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); - - subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); - //cout << "Number of Active Nodes = " << subgraph.numActiveNodes << endl; - - thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); - - makeQueue<<>>(subgraph.d_activeNodes, d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - thrust::device_ptr ptr_degrees(d_activeNodesDegree); - thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); - - thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); - - makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, 
d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - unsigned int numActiveEdges = 0; - if(subgraph.numActiveNodes>0) - numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; - - unsigned int last = numActiveEdges; - gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(unsigned int), cudaMemcpyHostToDevice)); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, (subgraph.numActiveNodes+1)*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - - //finishDynG = std::chrono::system_clock::now(); - //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; - //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); - //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; - - //td::chrono::time_point startDynC, finishDynC; - //startDynC = std::chrono::system_clock::now(); - - unsigned int numThreads = NUM_THREADS; - - if(subgraph.numActiveNodes < THRESHOLD_THREAD) - numThreads = 1; - - thread runThreads[numThreads]; - - for(unsigned int t=0; t, - t, - numThreads, - subgraph.numActiveNodes, - subgraph.activeNodes, - graph.outDegree, - subgraph.activeNodesPointer, - graph.nodePointer, - subgraph.activeEdgeList, - graph.edgeList); - - } - - for(unsigned int t=0; t elapsed_seconds_dync = finishDynC-startDynC; - //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); - //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; - + //std::chrono::time_point startDynG, finishDynG; + //startDynG = std::chrono::system_clock::now(); + prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_delta, graph.num_nodes, acc); + thrust::device_ptr ptr_labeling(d_activeNodesLabeling); + thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); + subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); + //cout << "Number of Active Nodes = " << subgraph.numActiveNodes << endl; + thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); + makeQueue<<>>(subgraph.d_activeNodes, d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); + + + ull n = subgraph.numActiveNodes; + + gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, n*sizeof(uint), cudaMemcpyDeviceToHost)); + thrust::device_ptr ptr_degrees(d_activeNodesDegree); + thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); + thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); + makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, n*sizeof(ull), cudaMemcpyDeviceToHost)); + ull numActiveEdges = 0; + if(subgraph.numActiveNodes>0) + numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; + ull last = numActiveEdges; + gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(ull), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, 
subgraph.d_activeNodesPointer, (n+1)*sizeof(ull), cudaMemcpyDeviceToHost)); + //finishDynG = std::chrono::system_clock::now(); + //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; + //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); + //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; + //td::chrono::time_point startDynC, finishDynC; + //startDynC = std::chrono::system_clock::now(); + uint numThreads = NUM_THREADS; + + if(subgraph.numActiveNodes < THRESHOLD_THREAD) + numThreads = 1; + + thread runThreads[numThreads]; + for(uint t=0; t, + t, + numThreads, + subgraph.numActiveNodes, + subgraph.activeNodes, + graph.outDegree, + subgraph.activeNodesPointer, + graph.nodePointer, + subgraph.activeEdgeList, + graph.edgeList); + + } + for(uint t=0; t elapsed_seconds_dync = finishDynC-startDynC; + //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); + //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; } template class SubgraphGenerator; diff --git a/shared/subgraph_generator.cuh b/shared/subgraph_generator.cuh index 9f2f9c6..bf7e364 100644 --- a/shared/subgraph_generator.cuh +++ b/shared/subgraph_generator.cuh @@ -15,21 +15,21 @@ class SubgraphGenerator private: public: - unsigned int *activeNodesLabeling; - unsigned int *activeNodesDegree; - unsigned int *prefixLabeling; - unsigned int *prefixSumDegrees; - unsigned int *d_activeNodesLabeling; - unsigned int *d_activeNodesDegree; - unsigned int *d_prefixLabeling; - unsigned int *d_prefixSumDegrees; - SubgraphGenerator(Graph &graph); - SubgraphGenerator(GraphPR &graph); - void generate(Graph &graph, Subgraph &subgraph); - void generate(GraphPR &graph, Subgraph &subgraph, float acc); + uint *activeNodesLabeling; + uint *activeNodesDegree; + uint *prefixLabeling; + ull *prefixSumDegrees; + uint *d_activeNodesLabeling; + uint *d_activeNodesDegree; + uint *d_prefixLabeling; + ull *d_prefixSumDegrees; + SubgraphGenerator(Graph &graph); + SubgraphGenerator(GraphPR &graph); + void generate(Graph &graph, Subgraph &subgraph); + void generate(GraphPR &graph, Subgraph &subgraph, float acc); }; -#endif // SUBGRAPH_GENERATOR_HPP +#endif // SUBGRAPH_GENERATOR_HPP diff --git a/shared/subway_utilities.cpp b/shared/subway_utilities.cpp index 605f333..ebd36e3 100644 --- a/shared/subway_utilities.cpp +++ b/shared/subway_utilities.cpp @@ -1,72 +1,71 @@ - #include "subway_utilities.hpp" void utilities::PrintResults(uint *results, uint n) { - cout << "Results of first "<< n << " nodes:\n["; - for(int i=0; i0) - cout << " "; - cout << i << ":" << results[i]; - } - cout << "]\n"; + cout << "Results of first "<< n << " nodes:\n["; + for(int i=0; i0) + cout << " "; + cout << i << ":" << results[i]; + } + cout << "]\n"; } void utilities::PrintResults(float *results, uint n) { - cout << "Results of first "<< n << " nodes:\n["; - for(int i=0; i0) - cout << " "; - cout << i << ":" << results[i]; - } - cout << "]\n"; + cout << "Results of first "<< n << " nodes:\n["; + for(int i=0; i0) + cout << " "; + cout << i << ":" << results[i]; + } + cout << "]\n"; } void utilities::PrintResults(double *results, uint n) { - cout << "Results of first "<< n << " nodes:\n["; - for(int i=0; i0) - cout << " "; - cout << i << ":" << results[i]; - } - cout << "]\n"; + cout << "Results of first "<< n << " nodes:\n["; + for(int i=0; i0) + cout << " "; + cout << i << ":" << results[i]; + } + cout << "]\n"; } void utilities::SaveResults(string filepath, uint 
*results, uint n) { - cout << "Saving the results into the following file:\n"; - cout << ">> " << filepath << endl; - ofstream outfile; - outfile.open(filepath); - for(int i=0; i> " << filepath << endl; + ofstream outfile; + outfile.open(filepath); + for(int i=0; i Test::Test() { - this->a = 1; - this->b = 1; + this->a = 1; + this->b = 1; } template int Test::sum(int a, int b) { - return a + b; + return a + b; } diff --git a/shared/test.cuh b/shared/test.cuh index 1158070..563c964 100644 --- a/shared/test.cuh +++ b/shared/test.cuh @@ -7,10 +7,10 @@ class Test private: public: - int a; - int b; + int a; + int b; Test(); int sum(int a, int b); }; -#endif // TEST_HPP +#endif // TEST_HPP diff --git a/shared/timer.cpp b/shared/timer.cpp index a79e65c..3442c47 100644 --- a/shared/timer.cpp +++ b/shared/timer.cpp @@ -4,20 +4,20 @@ void Timer::Start() { - //A = chrono::system_clock::now(); - gettimeofday( &StartingTime, NULL ); + //A = chrono::system_clock::now(); + gettimeofday( &StartingTime, NULL ); } float Timer::Finish() { - //B = std::chrono::system_clock::now(); - //chrono::duration elapsed_seconds = B - A; - //time_t finish_time = std::chrono::system_clock::to_time_t(B); - //cout << "title" << elapsed_seconds.count()*1000; - timeval PausingTime, ElapsedTime; - gettimeofday( &PausingTime, NULL ); - timersub(&PausingTime, &StartingTime, &ElapsedTime); - float d = ElapsedTime.tv_sec*1000.0+ElapsedTime.tv_usec/1000.0; - return d; + //B = std::chrono::system_clock::now(); + //chrono::duration elapsed_seconds = B - A; + //time_t finish_time = std::chrono::system_clock::to_time_t(B); + //cout << "title" << elapsed_seconds.count()*1000; + timeval PausingTime, ElapsedTime; + gettimeofday( &PausingTime, NULL ); + timersub(&PausingTime, &StartingTime, &ElapsedTime); + float d = ElapsedTime.tv_sec*1000.0+ElapsedTime.tv_usec/1000.0; + return d; } diff --git a/shared/timer.hpp b/shared/timer.hpp index ff324d1..f069172 100644 --- a/shared/timer.hpp +++ b/shared/timer.hpp @@ -10,14 +10,14 @@ class Timer { private: - //chrono::time_point A, B; - timeval StartingTime; + //chrono::time_point A, B; + timeval StartingTime; public: void Start(); float Finish(); }; -#endif // TIMER_HPP +#endif // TIMER_HPP diff --git a/subway/Makefile b/subway/Makefile index f56a951..bc4af04 100644 --- a/subway/Makefile +++ b/subway/Makefile @@ -1,8 +1,8 @@ CC=g++ NC=nvcc -CFLAGS=-std=c++11 -O3 -NFLAGS=-arch=sm_60 +CFLAGS=-std=c++14 -O3 +#NFLAGS=-arch=sm_80 SHARED=../shared diff --git a/subway/bfs-async.cu b/subway/bfs-async.cu index 07bad99..d44e23b 100644 --- a/subway/bfs-async.cu +++ b/subway/bfs-async.cu @@ -12,112 +12,89 @@ int main(int argc, char** argv) { - cudaFree(0); + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - for(unsigned int i=0; i graph(arguments.input, false); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); - - for(unsigned int i=0; i subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph); + for(uint i=0; i partitioner; - - timer.Start(); - - unsigned int gItr = 0; - - bool finished; - bool *d_finished; - 
gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); - - while (subgraph.numActiveNodes>0) - { - gItr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint gItr = 0; + bool finished; + bool *d_finished; + gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); + while (subgraph.numActiveNodes>0) + { + gItr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - - uint itr = 0; - do - { - itr++; - finished = true; - gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); - - bfs_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - d_finished, - (itr%2==1) ? graph.d_label1 : graph.d_label2, - (itr%2==1) ? graph.d_label2 : graph.d_label1); + //moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + uint itr = 0; + do + { + itr++; + finished = true; + gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); + bfs_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + d_finished, + (itr%2==1) ? graph.d_label1 : graph.d_label2, + (itr%2==1) ? graph.d_label2 : graph.d_label1); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - - gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); - }while(!(finished)); - - cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaDeviceSynchronize(); + gpuErrorcheck( cudaPeekAtLastError() ); + gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); + }while(!(finished)); + cout << itr << ((itr>1) ? 
" Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; + } + subgen.generate(graph, subgraph); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/bfs-sync.cu b/subway/bfs-sync.cu index 68a5610..d5e6bdf 100644 --- a/subway/bfs-sync.cu +++ b/subway/bfs-sync.cu @@ -12,92 +12,79 @@ int main(int argc, char** argv) { - cudaFree(0); + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - for(unsigned int i=0; i graph(arguments.input, false); + graph.ReadGraph(); + + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice)); + + Subgraph subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph); - Partitioner partitioner; - - timer.Start(); - - uint itr = 0; - - while (subgraph.numActiveNodes>0) - { - itr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint itr = 0; + while (subgraph.numActiveNodes>0) + { + itr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - bfs_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - //d_finished, - graph.d_label1, - graph.d_label2); + bfs_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + //d_finished, + graph.d_label1, + graph.d_label2); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - cout << "Number of 
iterations = " << itr << endl; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaDeviceSynchronize(); + gpuErrorcheck( cudaPeekAtLastError() ); + } + subgen.generate(graph, subgraph); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + cout << "Number of iterations = " << itr << endl; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/cc-async.cu b/subway/cc-async.cu index 676b103..531726c 100644 --- a/subway/cc-async.cu +++ b/subway/cc-async.cu @@ -12,104 +12,84 @@ int main(int argc, char** argv) { - cudaFree(0); + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - for(unsigned int i=0; i graph(arguments.input, false); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); + ull n = graph.num_nodes; + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice)); + + Subgraph subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph); - Partitioner partitioner; - - timer.Start(); - - unsigned int gItr = 0; - - bool finished; - bool *d_finished; - gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); - - while (subgraph.numActiveNodes>0) - { - gItr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint gItr = 0; + bool finished; + bool *d_finished; + gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); + while (subgraph.numActiveNodes>0) + { + gItr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - - uint itr = 0; - do - { - itr++; - finished = true; - gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); - - cc_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - d_finished, - (itr%2==1) ? 
graph.d_label1 : graph.d_label2, - (itr%2==1) ? graph.d_label2 : graph.d_label1); + //moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + uint itr = 0; + do + { + itr++; + finished = true; + gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); + cc_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + d_finished, + (itr%2==1) ? graph.d_label1 : graph.d_label2, + (itr%2==1) ? graph.d_label2 : graph.d_label1); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - - gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); - }while(!(finished)); - - cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaDeviceSynchronize(); + gpuErrorcheck( cudaPeekAtLastError() ); + gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); + }while(!(finished)); + cout << itr << ((itr>1) ? 
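The convergence test in all the async variants is a single device-resident boolean: the host optimistically sets it to `true` before each launch, any thread that still changes state clears it, and the host loops until a launch completes with the flag intact. A standalone version of the same pattern, with a hypothetical kernel:

```
#include <cstdio>

// any thread that still makes progress clears the shared flag
__global__ void relax_once(int n, int *val, bool *d_finished)
{
	int tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid < n && val[tid] > 0)
	{
		val[tid]--;
		*d_finished = false;
	}
}

int main()
{
	const int n = 4;
	int h_val[n] = {3, 1, 0, 2};
	int *d_val;
	bool finished, *d_finished;
	cudaMalloc(&d_val, n * sizeof(int));
	cudaMalloc(&d_finished, sizeof(bool));
	cudaMemcpy(d_val, h_val, n * sizeof(int), cudaMemcpyHostToDevice);
	int rounds = 0;
	do {
		finished = true;                // optimistic reset on the host
		cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice);
		relax_once<<<1, 32>>>(n, d_val, d_finished);
		cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost);
		rounds++;
	} while (!finished);
	printf("converged after %d rounds\n", rounds);   // 4: three working passes + one clean pass
	return 0;
}
```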
" Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; + } + subgen.generate(graph, subgraph); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/cc-sync.cu b/subway/cc-sync.cu index 2d40951..f485d4a 100644 --- a/subway/cc-sync.cu +++ b/subway/cc-sync.cu @@ -12,89 +12,84 @@ int main(int argc, char** argv) { - cudaFree(0); - - ArgumentParser arguments(argc, argv, false, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - - for(unsigned int i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); - - - Partitioner partitioner; - - timer.Start(); - - uint itr = 0; - - while (subgraph.numActiveNodes>0) - { - itr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - - cc_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - //d_finished, - graph.d_label1, - graph.d_label2); - - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - cout << "Number of iterations = " << itr << endl; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaFree(0); + + ArgumentParser arguments(argc, argv, false, false); + Timer timer; + timer.Start(); + Graph graph(arguments.input, false); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + + for(uint i=0; i < graph.num_nodes; i++) + { + graph.value[i] = i; + graph.label1[i] = false; + graph.label2[i] = true; + } + + ull n = graph.num_nodes; + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice)); + Subgraph subgraph(graph.num_nodes, graph.num_edges); + + SubgraphGenerator subgen(graph); + + subgen.generate(graph, subgraph); + + Partitioner partitioner; + timer.Start(); + uint itr = 0; + + + while (subgraph.numActiveNodes>0) + { + std::cout << "number_of_active_nodes: " << subgraph.numActiveNodes << std::endl; + itr++; + partitioner.partition(subgraph, 
diff --git a/subway/pr-async.cu b/subway/pr-async.cu
index c5fe5e9..672eee5 100644
--- a/subway/pr-async.cu
+++ b/subway/pr-async.cu
@@ -14,112 +14,86 @@ int main(int argc, char** argv)
 {
-	
-	cudaFree(0);
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	GraphPR<OutEdge> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	//for(unsigned int i=0; i<100; i++)
-	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
-	
-	float initPR = 0.15;
-	float acc = 0.01;
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = initPR;
-		graph.delta[i] = initPR;
-	}
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(float), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_delta, graph.delta, graph.num_nodes * sizeof(float), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdge> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdge> subgen(graph);
-	
-	subgen.generate(graph, subgraph, acc);
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	GraphPR<OutEdge> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	//for(unsigned int i=0; i<100; i++)
+	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
+	float initPR = 0.15;
+	float acc = 0.01;
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = initPR;
+		graph.delta[i] = initPR;
+	}
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(float), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_delta, graph.delta, n * sizeof(float), cudaMemcpyHostToDevice));
+	Subgraph<OutEdge> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdge> subgen(graph);
+	subgen.generate(graph, subgraph, acc);
 
-	Partitioner<OutEdge> partitioner;
-	
-	timer.Start();
-	
-	uint gItr = 0;
-	
-	bool finished;
-	bool *d_finished;
-	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		gItr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			//mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			
-			uint itr = 0;
-			do
-			{
-				itr++;
-				finished = true;
-				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
-				
-				pr_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, graph.d_delta, d_finished, acc);
-				
-				cudaDeviceSynchronize();
-				gpuErrorcheck( cudaPeekAtLastError() );
-				
-				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
-			}while(!(finished));
-			
-			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
-		}
-		
-		subgen.generate(graph, subgraph, acc);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(float), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdge> partitioner;
+	timer.Start();
+	uint gItr = 0;
+	bool finished;
+	bool *d_finished;
+	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
+	while (subgraph.numActiveNodes>0)
+	{
+		gItr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			//mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			uint itr = 0;
+			do
+			{
+				itr++;
+				finished = true;
+				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
+				
+				pr_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, graph.d_delta, d_finished, acc);
+				
+				cudaDeviceSynchronize();
+				gpuErrorcheck( cudaPeekAtLastError() );
+				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
+			}while(!(finished));
+			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
+		}
+		subgen.generate(graph, subgraph, acc);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(float), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
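pr-async is a delta-based PageRank: ranks start at `initPR = 0.15`, each active vertex forwards a damped share of its pending `delta` along its out-edges, and a vertex stays active only while that pending mass exceeds `acc = 0.01`. A scalar sketch of the per-vertex update; the damping factor 0.85 and the self-contained propagation are assumptions for illustration, not lifted from the kernel:

```
#include <cstdio>

int main()
{
	float initPR = 0.15f, acc = 0.01f;
	float value = initPR;        // current rank of one vertex
	float delta = initPR;        // rank mass not yet propagated
	unsigned int outDegree = 4;
	while (delta > acc)          // vertex stays active while delta exceeds acc
	{
		float share = 0.85f * delta / outDegree;  // damped share per out-edge
		value += delta;
		delta = share;           // in the real kernel this lands on the neighbours
		printf("value=%f pending=%f\n", value, delta);
	}
	return 0;
}
```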
" Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; + } + subgen.generate(graph, subgraph, acc); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(float), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/pr-sync.cu b/subway/pr-sync.cu index f520663..376bb28 100644 --- a/subway/pr-sync.cu +++ b/subway/pr-sync.cu @@ -13,97 +13,75 @@ int main(int argc, char** argv) -{ - cudaFree(0); +{ + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - GraphPR graph(arguments.input, true); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - //for(unsigned int i=0; i<100; i++) - // cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8; - - float initPR = 0.15; - float acc = 0.01; - - for(unsigned int i=0; i graph(arguments.input, true); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + //for(unsigned int i=0; i<100; i++) + // cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8; + float initPR = 0.15; + float acc = 0.01; + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph, acc); + ull n = graph.num_nodes; + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(float), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_delta, graph.delta, n * sizeof(float), cudaMemcpyHostToDevice)); + Subgraph subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph, acc); - Partitioner partitioner; - - timer.Start(); - - uint gItr = 0; - - - while (subgraph.numActiveNodes>0) - { - gItr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint gItr = 0; + while (subgraph.numActiveNodes>0) + { + gItr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - graph.d_delta, - acc); + pr_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + graph.d_delta, + acc); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - - } - - subgen.generate(graph, subgraph, acc); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - cout << "Number of iterations = " << gItr << endl; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(float), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + cudaDeviceSynchronize(); + 
diff --git a/subway/sssp-async.cu b/subway/sssp-async.cu
index 036b5b7..a9a6cc5 100644
--- a/subway/sssp-async.cu
+++ b/subway/sssp-async.cu
@@ -14,124 +14,98 @@ int main(int argc, char** argv)
 {
-	/*
-	Test test;
-	cout << test.sum(20, 30) << endl;
-	*/
-	
-	cudaFree(0);
+	/*
+	Test test;
+	cout << test.sum(20, 30) << endl;
+	*/
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	//for(unsigned int i=0; i<100; i++)
-	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = DIST_INFINITY;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = 0;
-	graph.label1[arguments.sourceNode] = true;
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	//for(unsigned int i=0; i<100; i++)
+	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = DIST_INFINITY;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = 0;
+	graph.label1[arguments.sourceNode] = true;
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint gItr = 0;
-	
-	bool finished;
-	bool *d_finished;
-	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		gItr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			
-			uint itr = 0;
-			do
-			{
-				itr++;
-				finished = true;
-				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
-				
-				sssp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
-				
-				cudaDeviceSynchronize();
-				gpuErrorcheck( cudaPeekAtLastError() );
-				
-				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
-			}while(!(finished));
-			
-			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	//for(int i=0; i<20; i++)
-	//	cout << graph.value[i] << endl;
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint gItr = 0;
+	bool finished;
+	bool *d_finished;
+	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
+	while (subgraph.numActiveNodes>0)
+	{
+		gItr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			uint itr = 0;
+			do
+			{
+				itr++;
+				finished = true;
+				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
+				
+				sssp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
+				
+				cudaDeviceSynchronize();
+				gpuErrorcheck( cudaPeekAtLastError() );
+				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
+			}while(!(finished));
+			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	//for(int i=0; i<20; i++)
+	//	cout << graph.value[i] << endl;
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
diff --git a/subway/sssp-sync.cu b/subway/sssp-sync.cu
index e4a7440..892a963 100644
--- a/subway/sssp-sync.cu
+++ b/subway/sssp-sync.cu
@@ -12,92 +12,75 @@ int main(int argc, char** argv)
 {
-	cudaFree(0);
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = DIST_INFINITY;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = 0;
-	graph.label1[arguments.sourceNode] = true;
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = DIST_INFINITY;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = 0;
+	graph.label1[arguments.sourceNode] = true;
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint itr = 0;
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		itr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			sssp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
-					//d_finished,
-					graph.d_label1, graph.d_label2);
-			cudaDeviceSynchronize();
-			gpuErrorcheck( cudaPeekAtLastError() );
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime << " (ms).\n";
-	
-	cout << "Number of iterations = " << itr << endl;
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint itr = 0;
+	while (subgraph.numActiveNodes>0)
+	{
+		itr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			sssp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
+					//d_finished,
+					graph.d_label1, graph.d_label2);
+			cudaDeviceSynchronize();
+			gpuErrorcheck( cudaPeekAtLastError() );
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime << " (ms).\n";
+	cout << "Number of iterations = " << itr << endl;
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
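Both sssp variants relax edges Bellman–Ford style: an active vertex u with distance `value[u]` offers `value[u] + w8` to each out-neighbour, and a neighbour that improves becomes active for the next round. The scalar core of that update on a two-edge path; in the real kernel the write would need an atomic min, which is omitted here:

```
#include <cstdio>

int main()
{
	const unsigned int DIST_INFINITY = 0xFFFFFFFF;
	unsigned int value[3] = {0, DIST_INFINITY, DIST_INFINITY};  // source is vertex 0
	int src[] = {0, 1}, dst[] = {1, 2};
	unsigned int w8[] = {5, 2};                                 // edge weights
	for (int round = 0; round < 2; round++)
		for (int e = 0; e < 2; e++)
			if (value[src[e]] != DIST_INFINITY && value[src[e]] + w8[e] < value[dst[e]])
				value[dst[e]] = value[src[e]] + w8[e];
	printf("%u %u %u\n", value[0], value[1], value[2]);         // 0 5 7
	return 0;
}
```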
diff --git a/subway/sswp-async.cu b/subway/sswp-async.cu
index 9b8e230..6606d93 100644
--- a/subway/sswp-async.cu
+++ b/subway/sswp-async.cu
@@ -12,116 +12,92 @@ int main(int argc, char** argv)
 {
-	cudaFree(0);
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	//for(unsigned int i=0; i<100; i++)
-	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = 0;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = DIST_INFINITY;
-	graph.label1[arguments.sourceNode] = true;
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	//for(unsigned int i=0; i<100; i++)
+	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = 0;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = DIST_INFINITY;
+	graph.label1[arguments.sourceNode] = true;
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint gItr = 0;
-	
-	bool finished;
-	bool *d_finished;
-	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		gItr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			
-			uint itr = 0;
-			do
-			{
-				cout << "\t\tIteration " << ++itr << endl;
-				finished = true;
-				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
-				
-				sswp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
-				
-				cudaDeviceSynchronize();
-				gpuErrorcheck( cudaPeekAtLastError() );
-				
-				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
-			}while(!(finished));
-			
-			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint gItr = 0;
+	bool finished;
+	bool *d_finished;
+	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
+	while (subgraph.numActiveNodes>0)
+	{
+		gItr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			uint itr = 0;
+			do
+			{
+				cout << "\t\tIteration " << ++itr << endl;
+				finished = true;
+				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
+				
+				sswp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
+				
+				cudaDeviceSynchronize();
+				gpuErrorcheck( cudaPeekAtLastError() );
+				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
+			}while(!(finished));
+			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
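SSWP (single-source widest path) reuses the same machinery with the opposite extremes: the source is seeded to DIST_INFINITY, everything else to 0, and relaxing an edge takes the larger of the current bottleneck and `min(value[u], w8)`. In scalar form, under the same seeding assumption:

```
#include <algorithm>
#include <cstdio>

int main()
{
	const unsigned int DIST_INFINITY = 0xFFFFFFFF;
	unsigned int value[3] = {DIST_INFINITY, 0, 0};   // source bottleneck is unbounded
	int src[] = {0, 1}, dst[] = {1, 2};
	unsigned int w8[] = {7, 3};
	for (int round = 0; round < 2; round++)
		for (int e = 0; e < 2; e++)
			value[dst[e]] = std::max(value[dst[e]], std::min(value[src[e]], w8[e]));
	printf("%u %u %u\n", value[0], value[1], value[2]);  // bottlenecks: inf 7 3
	return 0;
}
```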
diff --git a/subway/sswp-sync.cu b/subway/sswp-sync.cu
index 32cea62..378ad72 100644
--- a/subway/sswp-sync.cu
+++ b/subway/sswp-sync.cu
@@ -11,93 +11,75 @@ int main(int argc, char** argv)
-{
-	cudaFree(0);
+{
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = 0;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = DIST_INFINITY;
-	graph.label1[arguments.sourceNode] = true;
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = 0;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = DIST_INFINITY;
+	graph.label1[arguments.sourceNode] = true;
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint itr = 0;
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		itr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			sswp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
-					//d_finished,
-					graph.d_label1, graph.d_label2);
-			cudaDeviceSynchronize();
-			gpuErrorcheck( cudaPeekAtLastError() );
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	cout << "Number of iterations = " << itr << endl;
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint itr = 0;
+	while (subgraph.numActiveNodes>0)
+	{
+		itr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			sswp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
+					//d_finished,
+					graph.d_label1, graph.d_label2);
+			cudaDeviceSynchronize();
+			gpuErrorcheck( cudaPeekAtLastError() );
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	cout << "Number of iterations = " << itr << endl;
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
diff --git a/tools/Makefile b/tools/Makefile
index 5a06f9e..24ea489 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,15 +1,16 @@
 CC=g++
 NC=nvcc
-CFLAGS=-std=c++11 -O3
-NFLAGS=-arch=sm_60
+CFLAGS=-std=c++14 -O3
+
+#NFLAGS=-arch=sm_80
 
 SHARED=../shared
 
-all: converter
+all: converter_stdin
 
-converter: converter.cpp
-	$(CC) converter.cpp -o converter $(CFLAGS)
+converter_stdin: converter_stdin.cpp
+	$(CC) converter_stdin.cpp -o converter_stdin $(CFLAGS)
 
 clean:
-	rm -f converter
+	rm -f converter_stdin
diff --git a/tools/converter.cpp b/tools/converter.cpp
deleted file mode 100644
index 68365ac..0000000
--- a/tools/converter.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include "../shared/globals.hpp"
-
-
-bool IsWeightedFormat(string format)
-{
-	if((format == "bwcsr") ||
-		(format == "wcsr") ||
-		(format == "wel"))
-		return true;
-	return false;
-}
-
-string GetFileExtension(string fileName)
-{
-	if(fileName.find_last_of(".") != string::npos)
-		return fileName.substr(fileName.find_last_of(".")+1);
-	return "";
-}
-
-int main(int argc, char** argv)
-{
-	if(argc!= 2)
-	{
-		cout << "\nThere was an error parsing command line arguments\n";
-		exit(0);
-	}
-
-	string input = string(argv[1]);
-
-	if(GetFileExtension(input) == "el")
-	{
-		ifstream infile;
-		infile.open(input);
-		stringstream ss;
-		uint max = 0;
-		string line;
-		uint edgeCounter = 0;
-
-		vector<Edge> edges;
-		Edge newEdge;
-		while(getline( infile, line ))
-		{
-			ss.str("");
-			ss.clear();
-			ss << line;
-
-			ss >> newEdge.source;
-			ss >> newEdge.end;
-
-			edges.push_back(newEdge);
-			edgeCounter++;
-
-			if(max < newEdge.source)
-				max = newEdge.source;
-			if(max < newEdge.end)
-				max = newEdge.end;
-		}
-		infile.close();
-
-		uint num_nodes = max + 1;
-		uint num_edges = edgeCounter;
-		uint *nodePointer = new uint[num_nodes+1];
-		OutEdge *edgeList = new OutEdge[num_edges];
-		uint *degree = new uint[num_nodes];
-		for(uint i=0; i<num_nodes; i++)
-			degree[i] = 0;
-		for(uint i=0; i<num_edges; i++)
-			degree[edges[i].source]++;
-
-		uint counter = 0;
-		for(uint i=0; i<num_nodes; i++)
-		{
-			nodePointer[i] = counter;
-			counter = counter + degree[i];
-		}
-		nodePointer[num_nodes] = num_edges;
-		uint *outDegreeCounter = new uint[num_nodes];
-		for(uint i=0; i<num_edges; i++)
-		{
-			uint location = nodePointer[edges[i].source] + outDegreeCounter[edges[i].source];
-			edgeList[location].end = edges[i].end;
-			outDegreeCounter[edges[i].source]++;
-		}
-
-		ofstream outfile(input.substr(0, input.length()-2)+"bcsr", ofstream::binary);
-		outfile.write((char*)&num_nodes, sizeof(uint));
-		outfile.write((char*)&num_edges, sizeof(uint));
-		outfile.write((char*)nodePointer, sizeof(uint)*num_nodes);
-		outfile.write((char*)edgeList, sizeof(OutEdge)*num_edges);
-		outfile.close();
-
-		delete[] nodePointer;
-		delete[] edgeList;
-		delete[] degree;
-	}
-	else if(GetFileExtension(input) == "wel")
-	{
-		ifstream infile;
-		infile.open(input);
-		stringstream ss;
-		uint max = 0;
-		string line;
-		uint edgeCounter = 0;
-
-		vector<EdgeWeighted> edges;
-		EdgeWeighted newEdge;
-		while(getline( infile, line ))
-		{
-			ss.str("");
-			ss.clear();
-			ss << line;
-
-			ss >> newEdge.source;
-			ss >> newEdge.end;
-			ss >> newEdge.w8;
-
-			edges.push_back(newEdge);
-			edgeCounter++;
-
-			if(max < newEdge.source)
-				max = newEdge.source;
-			if(max < newEdge.end)
-				max = newEdge.end;
-		}
-		infile.close();
-
-		uint num_nodes = max + 1;
-		uint num_edges = edgeCounter;
-		uint *nodePointer = new uint[num_nodes+1];
-		OutEdgeWeighted *edgeList = new OutEdgeWeighted[num_edges];
-		uint *degree = new uint[num_nodes];
-		for(uint i=0; i<num_nodes; i++)
-			degree[i] = 0;
-		for(uint i=0; i<num_edges; i++)
-			degree[edges[i].source]++;
-
-		uint counter = 0;
-		for(uint i=0; i<num_nodes; i++)
-		{
-			nodePointer[i] = counter;
-			counter = counter + degree[i];
-		}
-		nodePointer[num_nodes] = num_edges;
-		uint *outDegreeCounter = new uint[num_nodes];
-		for(uint i=0; i<num_edges; i++)
-		{
-			uint location = nodePointer[edges[i].source] + outDegreeCounter[edges[i].source];
-			edgeList[location].end = edges[i].end;
-			edgeList[location].w8 = edges[i].w8;
-			outDegreeCounter[edges[i].source]++;
-		}
-
-		ofstream outfile(input.substr(0, input.length()-3)+"bwcsr", ofstream::binary);
-		outfile.write((char*)&num_nodes, sizeof(uint));
-		outfile.write((char*)&num_edges, sizeof(uint));
-		outfile.write((char*)nodePointer, sizeof(uint)*num_nodes);
-		outfile.write((char*)edgeList, sizeof(OutEdgeWeighted)*num_edges);
-		outfile.close();
-
-		delete[] nodePointer;
-		delete[] edgeList;
-		delete[] degree;
-	}
-	else
-	{
-		cout << "\nInput file format is not supported.\n";
-		exit(0);
-	}
-
-	return 0;
-}
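The replacement converter (next file) streams edges from stdin and splits each line on a TAB with `std::string::find` and `std::stoul`, instead of stringstream extraction from an opened file. The same splitting technique isolated on one hypothetical three-field line:

```
#include <cstdio>
#include <string>

int main()
{
	std::string line = "3\t7\t42", delim = "\t";
	size_t start = 0, end = line.find(delim);
	unsigned int source = std::stoul(line.substr(start, end - start));
	start = end + delim.length(); end = line.find(delim, start);
	unsigned int dest = std::stoul(line.substr(start, end - start));
	start = end + delim.length();
	unsigned int w8 = std::stoul(line.substr(start));   // weight field
	printf("%u %u %u\n", source, dest, w8);
	return 0;
}
```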
diff --git a/tools/converter_stdin.cpp b/tools/converter_stdin.cpp
new file mode 100644
--- /dev/null
+++ b/tools/converter_stdin.cpp
+#include "../shared/globals.hpp"
+#include <iostream>
+#include <string>
+
+bool IsWeightedFormat(string format)
+{
+	if((format == "bwcsr") ||
+		(format == "wcsr") ||
+		(format == "wel"))
+		return true;
+	return false;
+}
+
+string GetFileExtension(string fileName)
+{
+	if(fileName.find_last_of(".") != string::npos)
+		return fileName.substr(fileName.find_last_of(".")+1);
+	return "";
+}
+
+void save_edge_data_to_csr(const std::string& output_filename)
+{
+	uint max = 0;
+	ull edgeCounter = 0;
+	vector<Edge> edges;
+	Edge newEdge;
+
+	std::string delim = "\t";
+	for (std::string line; std::getline(std::cin, line);)
+	{
+		auto start = 0;
+		auto end = line.find(delim);
+		newEdge.source = static_cast<uint>(std::stoul(line.substr(start, end - start)));
+		start = end + delim.length();
+		newEdge.end = static_cast<uint>(std::stoul(line.substr(start)));
+		edges.push_back(newEdge);
+		edgeCounter++;
+		if(max < newEdge.source)
+			max = newEdge.source;
+		if(max < newEdge.end)
+			max = newEdge.end;
+	}
+	uint num_nodes = max + 1;
+	ull num_edges = edgeCounter;
+	ull *nodePointer = new ull[num_nodes+1];
+	OutEdge *edgeList = new OutEdge[num_edges];
+	//out degree
+	uint *degree = new uint[num_nodes];
+	for(uint i=0; i<num_nodes; i++)
+		degree[i] = 0;
+	for(ull i=0; i<num_edges; i++)
+		degree[edges[i].source]++;
+
+	ull counter = 0;
+	for(uint i=0; i<num_nodes; i++)
+	{
+		nodePointer[i] = counter;
+		counter = counter + degree[i];
+	}
+	nodePointer[num_nodes] = num_edges;
+	uint *outDegreeCounter = new uint[num_nodes];
+	for(ull i=0; i<num_edges; i++)
+	{
+		ull location = nodePointer[edges[i].source] + outDegreeCounter[edges[i].source];
+		edgeList[location].end = edges[i].end;
+		outDegreeCounter[edges[i].source]++;
+	}
+
+	ofstream outfile(output_filename, ofstream::binary);
+	outfile.write((char*)&num_nodes, sizeof(uint));
+	outfile.write((char*)&num_edges, sizeof(ull));
+	outfile.write((char*)nodePointer, sizeof(ull)*(num_nodes+1));
+	outfile.write((char*)edgeList, sizeof(OutEdge)*num_edges);
+	outfile.close();
+
+	delete[] nodePointer;
+	delete[] edgeList;
+	delete[] degree;
+}
+
+void parseLine(const std::string& line, EdgeWeighted& edge, const std::string& delimiter)
+{
+	auto start = 0;
+	auto end = line.find(delimiter);
+	edge.source = static_cast<uint>(std::stoul(line.substr(start, end - start)));
+	start = end + delimiter.length();
+	end = line.find(delimiter, start);
+	edge.end = static_cast<uint>(std::stoul(line.substr(start, end - start)));
+	start = end + delimiter.length();
+	edge.w8 = static_cast<uint>(std::stoul(line.substr(start)));
+}
+
+void save_weighted_edge_data_to_csr(const std::string& output_filename)
+{
+	uint max = 0;
+	string line;
+	ull edgeCounter = 0;
+
+	vector<EdgeWeighted> edges;
+	EdgeWeighted newEdge;
+	std::string delim = "\t";
+	for (std::string line; std::getline(std::cin, line);)
+	{
+		parseLine(line, newEdge, delim);
+		edges.push_back(newEdge);
+		edgeCounter++;
+		if(max < newEdge.source)
+			max = newEdge.source;
+		if(max < newEdge.end)
+			max = newEdge.end;
+	}
+	uint num_nodes = max + 1;
+	ull num_edges = edgeCounter;
+	ull *nodePointer = new ull[num_nodes+1];
+	OutEdgeWeighted *edgeList = new OutEdgeWeighted[num_edges];
+	uint *degree = new uint[num_nodes];
+	for(uint i=0; i