diff --git a/Makefile b/Makefile
index 0cf896c..851b6b1 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,10 @@ CC=g++
 NC=nvcc
-CFLAGS=-std=c++11 -O3
-NFLAGS=-arch=sm_60
+CFLAGS=-std=c++14 -O3
+
+# Tesla T4: sm_75, A100: sm_80
+NFLAGS=-arch=sm_75
 
 SHARED=shared
 SUBWAY=subway
@@ -14,13 +16,13 @@ DEP=$(SHARED)/timer.o $(SHARED)/argument_parsing.o $(SHARED)/graph.o $(SHARED)/s
 all: make1 make2 make3 bfs-sync cc-sync sssp-sync sswp-sync pr-sync bfs-async cc-async sssp-async sswp-async pr-async
 
 make1:
-	make -C $(SHARED)
+	make -C $(SHARED) NFLAGS=${NFLAGS}
 
 make2:
-	make -C $(SUBWAY)
+	make -C $(SUBWAY) NFLAGS=${NFLAGS}
 
 make3:
-	make -C $(TOOLS)
+	make -C $(TOOLS) NFLAGS=${NFLAGS}
 
 bfs-sync: $(SUBWAY)/bfs-sync.o $(DEP)
diff --git a/README.md b/README.md
index 19b164a..a5c1d98 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,15 @@ Graph.wel ("SOURCE DESTINATION WEIGHT" for each edge in each line):
 ```
 1 2 10
 ```
+A TAB character is used as the delimiter in both the .el and .wel graph formats.
+
 To convert these graph files to the binary format, run the following commands in the root folder:
 ```
-tools/converter path_to_Graph.el
-tools/converter path_to_Graph.wel
+cat path_to_Graph.el | tools/converter_stdin el output_filename.bcsr
+cat path_to_Graph.wel | tools/converter_stdin wel output_filename.bwcsr
 ```
-The first command converts Graph.el to the binary CSR format and generates a binary graph file with .bcsr extension under the same directory as the original file. The second command converts Graph.wel to a weighted binary graph file with .bwcsr extension.
+The first command converts Graph.el to the binary CSR format and generates a binary graph file with the .bcsr extension. The second command converts Graph.wel to a weighted binary graph file with the .bwcsr extension. (The binary layout is sketched below.)
 
 #### Running applications in Subway
 The applications take a graph as input as well as some optional arguments.
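For reference, here is a minimal sketch of the binary layout that `converter_stdin` appears to produce for `.bcsr` files, inferred from what `shared/graph.cu` in this patch reads back: a 32-bit node count, then a 64-bit edge count, then a 64-bit `nodePointer` array of length `num_nodes + 1`, then the edge list. The file name is illustrative, and `OutEdge` mirrors the unweighted edge record in `shared/globals.hpp` (the weighted `.bwcsr` records additionally carry a weight):

```
// read_bcsr.cpp -- sanity-check a .bcsr file
// (layout inferred from shared/graph.cu in this patch; file name is illustrative)
#include <cstdint>
#include <fstream>
#include <iostream>
#include <vector>

struct OutEdge { uint32_t end; };  // unweighted edge record, as in shared/globals.hpp

int main()
{
    std::ifstream in("output_filename.bcsr", std::ios::in | std::ios::binary);
    uint32_t num_nodes = 0;  // node IDs remain 32-bit
    uint64_t num_edges = 0;  // the edge count is 64-bit (ull) after this patch
    in.read(reinterpret_cast<char*>(&num_nodes), sizeof(num_nodes));
    in.read(reinterpret_cast<char*>(&num_edges), sizeof(num_edges));

    // CSR index: nodePointer[v] .. nodePointer[v+1] delimit the out-edges of node v.
    std::vector<uint64_t> nodePointer(static_cast<size_t>(num_nodes) + 1);
    std::vector<OutEdge> edgeList(num_edges);
    in.read(reinterpret_cast<char*>(nodePointer.data()), nodePointer.size() * sizeof(uint64_t));
    in.read(reinterpret_cast<char*>(edgeList.data()), num_edges * sizeof(OutEdge));

    std::cout << num_nodes << " nodes, " << num_edges << " edges\n";
    return 0;
}
```

Because the edge count and node pointers are now 64-bit, `.bcsr`/`.bwcsr` files produced by the old `tools/converter` (which stored these as 32-bit values) presumably need to be regenerated with `converter_stdin`.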
For example: diff --git a/shared/Makefile b/shared/Makefile index 32dd448..1ed59c9 100644 --- a/shared/Makefile +++ b/shared/Makefile @@ -1,7 +1,7 @@ CC=g++ NC=nvcc -CFLAGS=-std=c++11 -O3 -NFLAGS=-arch=sm_60 +CFLAGS=-std=c++14 -O3 +#NFLAGS=-arch=sm_80 all: timer.o argument_parsing.o graph.o subgraph.o partitioner.o subgraph_generator.o gpu_kernels.o subway_utilities.o test.o diff --git a/shared/argument_parsing.cu b/shared/argument_parsing.cu index 159d485..9377977 100644 --- a/shared/argument_parsing.cu +++ b/shared/argument_parsing.cu @@ -4,117 +4,117 @@ ArgumentParser::ArgumentParser(int argc, char **argv, bool canHaveSource, bool canHaveItrs) { - this->argc = argc; - this->argv = argv; - this->canHaveSource = canHaveSource; - this->canHaveItrs = canHaveItrs; - - this->sourceNode = 0; - this->deviceID = 0; - this->numberOfItrs = 1; - - hasInput = false; - hasSourceNode = false; - hasOutput = false; - hasDeviceID = false; - hasNumberOfItrs = false; - - Parse(); + this->argc = argc; + this->argv = argv; + this->canHaveSource = canHaveSource; + this->canHaveItrs = canHaveItrs; + + this->sourceNode = 0; + this->deviceID = 0; + this->numberOfItrs = 1; + + hasInput = false; + hasSourceNode = false; + hasOutput = false; + hasDeviceID = false; + hasNumberOfItrs = false; + + Parse(); } - + bool ArgumentParser::Parse() { - try - { - if(argc == 1) - { - cout << GenerateHelpString(); - exit(0); - } - - if(argc == 2) - if ((strcmp(argv[1], "--help") == 0) || - (strcmp(argv[1], "-help") == 0) || - (strcmp(argv[1], "--h") == 0) || - (strcmp(argv[1], "-h") == 0)) - { - cout << GenerateHelpString(); - exit(0); - } - - if(argc%2 == 0) - { - cout << "\nThere was an error parsing command line arguments\n"; - cout << GenerateHelpString(); - exit(0); - } - - - for(int i=1; i\n"; - cout << GenerateHelpString(); - exit(0); - } - } - - if(hasInput) - return true; - else - { - cout << "\nInput graph file argument is required.\n"; - cout << GenerateHelpString(); - exit(0); - } - } - catch( const std::exception& strException ) { - std::cerr << strException.what() << "\n"; - GenerateHelpString(); - exit(0); - } - catch(...) { - std::cerr << "An exception has occurred.\n"; - GenerateHelpString(); - exit(0); - } + try + { + if(argc == 1) + { + cout << GenerateHelpString(); + exit(0); + } + + if(argc == 2) + if ((strcmp(argv[1], "--help") == 0) || + (strcmp(argv[1], "-help") == 0) || + (strcmp(argv[1], "--h") == 0) || + (strcmp(argv[1], "-h") == 0)) + { + cout << GenerateHelpString(); + exit(0); + } + + if(argc%2 == 0) + { + cout << "\nThere was an error parsing command line arguments\n"; + cout << GenerateHelpString(); + exit(0); + } + + + for(int i=1; i\n"; + cout << GenerateHelpString(); + exit(0); + } + } + + if(hasInput) + return true; + else + { + cout << "\nInput graph file argument is required.\n"; + cout << GenerateHelpString(); + exit(0); + } + } + catch( const std::exception& strException ) { + std::cerr << strException.what() << "\n"; + GenerateHelpString(); + exit(0); + } + catch(...) { + std::cerr << "An exception has occurred.\n"; + GenerateHelpString(); + exit(0); + } } string ArgumentParser::GenerateHelpString(){ - string str = "\nRequired arguments:"; - str += "\n [--input]: Input graph file. E.g., --input FacebookGraph.txt"; - str += "\nOptional arguments"; - if(canHaveSource) - str += "\n [--source]: Begins from the source (Default: 0). E.g., --source 10"; - str += "\n [--output]: Output file for results. E.g., --output results.txt"; - str += "\n [--device]: Select GPU device (default: 0). 
E.g., --device 1"; - if(canHaveItrs) - str += "\n [--iteration]: Number of iterations (default: 1). E.g., --iterations 10"; - str += "\n\n"; - return str; + string str = "\nRequired arguments:"; + str += "\n [--input]: Input graph file. E.g., --input FacebookGraph.txt"; + str += "\nOptional arguments"; + if(canHaveSource) + str += "\n [--source]: Begins from the source (Default: 0). E.g., --source 10"; + str += "\n [--output]: Output file for results. E.g., --output results.txt"; + str += "\n [--device]: Select GPU device (default: 0). E.g., --device 1"; + if(canHaveItrs) + str += "\n [--iteration]: Number of iterations (default: 1). E.g., --iterations 10"; + str += "\n\n"; + return str; } diff --git a/shared/argument_parsing.cuh b/shared/argument_parsing.cuh index 25dd771..b8ba411 100644 --- a/shared/argument_parsing.cuh +++ b/shared/argument_parsing.cuh @@ -9,31 +9,31 @@ class ArgumentParser private: public: - int argc; - char** argv; - - bool canHaveSource; - bool canHaveItrs; - - bool hasInput; - bool hasSourceNode; - bool hasOutput; - bool hasDeviceID; - bool hasNumberOfItrs; - string input; - int sourceNode; - string output; - int deviceID; - int numberOfItrs; - - - ArgumentParser(int argc, char **argv, bool canHaveSource, bool canHaveItrs); - - bool Parse(); - - string GenerateHelpString(); - + int argc; + char** argv; + + bool canHaveSource; + bool canHaveItrs; + + bool hasInput; + bool hasSourceNode; + bool hasOutput; + bool hasDeviceID; + bool hasNumberOfItrs; + string input; + int sourceNode; + string output; + int deviceID; + int numberOfItrs; + + + ArgumentParser(int argc, char **argv, bool canHaveSource, bool canHaveItrs); + + bool Parse(); + + string GenerateHelpString(); + }; -#endif // ARGUMENT_PARSING_HPP +#endif // ARGUMENT_PARSING_HPP diff --git a/shared/globals.hpp b/shared/globals.hpp index 99b6c03..3fbe3f3 100644 --- a/shared/globals.hpp +++ b/shared/globals.hpp @@ -40,12 +40,12 @@ struct OutEdgeWeighted{ }; struct Edge{ - uint source; + uint source; uint end; }; struct EdgeWeighted{ - uint source; + uint source; uint end; uint w8; }; @@ -53,4 +53,4 @@ struct EdgeWeighted{ -#endif // GLOBALS_HPP +#endif // GLOBALS_HPP diff --git a/shared/gpu_error_check.cuh b/shared/gpu_error_check.cuh index 108d071..c175615 100644 --- a/shared/gpu_error_check.cuh +++ b/shared/gpu_error_check.cuh @@ -1,5 +1,5 @@ -#ifndef GPU_ERROR_CHECK_CUH -#define GPU_ERROR_CHECK_CUH +#ifndef GPU_ERROR_CHECK_CUH +#define GPU_ERROR_CHECK_CUH //#include //#include @@ -15,4 +15,4 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t } } -#endif // GPU_ERROR_CHECK_CUH +#endif // GPU_ERROR_CHECK_CUH diff --git a/shared/gpu_kernels.cu b/shared/gpu_kernels.cu index 2955cee..ac11142 100644 --- a/shared/gpu_kernels.cu +++ b/shared/gpu_kernels.cu @@ -1,4 +1,3 @@ - #include "gpu_kernels.cuh" #include "globals.hpp" #include "gpu_error_check.cuh" @@ -6,530 +5,465 @@ #include "subgraph.cuh" -__global__ void bfs_kernel(unsigned int numNodes, - unsigned int from, - unsigned int numPartitionedEdges, - unsigned int *activeNodes, - unsigned int *activeNodesPointer, - OutEdge *edgeList, - unsigned int *outDegree, - unsigned int *value, - //bool *finished, - bool *label1, - bool *label2) +__global__ void bfs_kernel(uint numNodes, + uint from, + ull numPartitionedEdges, + uint *activeNodes, + ull *activeNodesPointer, + OutEdge *edgeList, + uint *outDegree, + uint *value, + bool *label1, + bool *label2) { - unsigned int tId = blockDim.x * blockIdx.x + threadIdx.x; - - if(tId < numNodes) - { 
- unsigned int id = activeNodes[from + tId]; - - if(label1[id] == false) - return; - - label1[id] = false; - - unsigned int sourceWeight = value[id]; - - unsigned int thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int degree = outDegree[id]; - unsigned int thisTo = thisFrom + degree; - - //printf("******* %i\n", thisFrom); - - unsigned int finalDist; - - for(unsigned int i=thisFrom; i dist[edgeList[i].end]) - { - atomicMax(&dist[edgeList[i].end] , finalDist); - - //*finished = false; - - //label1[edgeList[i].end] = true; - - label2[edgeList[i].end] = true; - } - } - } + uint tId = blockDim.x * blockIdx.x + threadIdx.x; + + if(tId < numNodes) + { + uint id = activeNodes[from + tId]; + if(label1[id] == false) + return; + label1[id] = false; + uint sourceWeight = dist[id]; + + ull thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; + uint degree = outDegree[id]; + ull thisTo = thisFrom + degree; + //printf("******* %i\n", thisFrom); + uint finalDist; + for(ull i=thisFrom; i dist[edgeList[i].end]) + { + atomicMax(&dist[edgeList[i].end] , finalDist); + + //*finished = false; + //label1[edgeList[i].end] = true; + + label2[edgeList[i].end] = true; + } + } + } } -__global__ void pr_kernel(unsigned int numNodes, - unsigned int from, - unsigned int numPartitionedEdges, - unsigned int *activeNodes, - unsigned int *activeNodesPointer, - OutEdge *edgeList, - unsigned int *outDegree, - float *dist, - float *delta, - //bool *finished, - float acc) +__global__ void pr_kernel(uint numNodes, + uint from, + ull numPartitionedEdges, + uint *activeNodes, + ull *activeNodesPointer, + OutEdge *edgeList, + uint *outDegree, + float *dist, + float *delta, + //bool *finished, + float acc) { - unsigned int tId = blockDim.x * blockIdx.x + threadIdx.x; - - if(tId < numNodes) - { - unsigned int id = activeNodes[from + tId]; - unsigned int degree = outDegree[id]; - float thisDelta = delta[id]; - - if(thisDelta > acc) - { - dist[id] += thisDelta; - - if(degree != 0) - { - //*finished = false; - - float sourcePR = ((float) thisDelta / degree) * 0.85; - - unsigned int thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int thisto = thisfrom + degree; - - for(unsigned int i=thisfrom; i acc) + { + dist[id] += thisDelta; + if(degree != 0) + { + //*finished = false; + float sourcePR = ((float) thisDelta / degree) * 0.85; + + ull thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; + ull thisto = thisfrom + degree; + for(ull i=thisfrom; i dist[edgeList[i].end]) - { - atomicMax(&dist[edgeList[i].end] , finalDist); - - *finished = false; - - //label1[edgeList[i].end] = true; - - label2[edgeList[i].end] = true; - } - } - } + uint tId = blockDim.x * blockIdx.x + threadIdx.x; + + if(tId < numNodes) + { + uint id = activeNodes[from + tId]; + if(label1[id] == false) + return; + label1[id] = false; + uint sourceWeight = dist[id]; + + ull thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; + uint degree = outDegree[id]; + uint thisTo = thisFrom + degree; + uint finalDist; + for(ull i=thisFrom; i dist[edgeList[i].end]) + { + atomicMax(&dist[edgeList[i].end] , finalDist); + + *finished = false; + //label1[edgeList[i].end] = true; + + label2[edgeList[i].end] = true; + } + } + } } -__global__ void cc_async(unsigned int numNodes, - unsigned int from, - unsigned int numPartitionedEdges, - unsigned int *activeNodes, - unsigned int *activeNodesPointer, - OutEdge *edgeList, - unsigned int *outDegree, - unsigned int *dist, - bool *finished, - bool *label1, - bool *label2) 
+__global__ void cc_async(uint numNodes, + uint from, + ull numPartitionedEdges, + uint *activeNodes, + ull *activeNodesPointer, + OutEdge *edgeList, + uint *outDegree, + uint *dist, + bool *finished, + bool *label1, + bool *label2) { - unsigned int tId = blockDim.x * blockIdx.x + threadIdx.x; - - if(tId < numNodes) - { - unsigned int id = activeNodes[from + tId]; - - if(label1[id] == false) - return; - - label1[id] = false; - - unsigned int sourceWeight = dist[id]; - - unsigned int thisFrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int degree = outDegree[id]; - unsigned int thisTo = thisFrom + degree; - - //printf("******* %i\n", thisFrom); - - //unsigned int finalDist; - - for(unsigned int i=thisFrom; i acc) - { - dist[id] += thisDelta; - - if(degree != 0) - { - *finished = false; - - float sourcePR = ((float) thisDelta / degree) * 0.85; - - unsigned int thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; - unsigned int thisto = thisfrom + degree; - - for(unsigned int i=thisfrom; i acc) + { + dist[id] += thisDelta; + if(degree != 0) + { + *finished = false; + float sourcePR = ((float) thisDelta / degree) * 0.85; + + ull thisfrom = activeNodesPointer[from+tId]-numPartitionedEdges; + ull thisto = thisfrom + degree; + for(ull i=thisfrom; i Graph::Graph(string graphFilePath, bool isWeighted) { - this->graphFilePath = graphFilePath; - this->isWeighted = isWeighted; + this->graphFilePath = graphFilePath; + this->isWeighted = isWeighted; } template @@ -31,169 +31,42 @@ void Graph::AssignW8(uint w8, uint index) template void Graph::ReadGraph() { + cout << "Reading the input graph from the following file:\n>> " << graphFilePath << endl; + this->graphFormat = GetFileExtension(graphFilePath); + if(graphFormat == "bcsr" || graphFormat == "bwcsr") + { + ifstream infile (graphFilePath, ios::in | ios::binary); + infile.read ((char*)&num_nodes, sizeof(uint)); + infile.read ((char*)&num_edges, sizeof(ull)); + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + ull num_node = num_nodes; + infile.read ((char*)nodePointer, (num_node + 1 ) * sizeof(ull) ); + infile.read ((char*)edgeList, num_edges * sizeof(E) ); + std::cout << nodePointer[num_nodes] << std::endl; + } + else + { + cout << "The graph format is not supported!\n"; + exit(-1); + } + outDegree = new uint[num_nodes]; + for(uint i=1; i> " << graphFilePath << endl; - - this->graphFormat = GetFileExtension(graphFilePath); - - if(graphFormat == "bcsr" || graphFormat == "bwcsr") - { - ifstream infile (graphFilePath, ios::in | ios::binary); - - infile.read ((char*)&num_nodes, sizeof(uint)); - infile.read ((char*)&num_edges, sizeof(uint)); - - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - - infile.read ((char*)nodePointer, sizeof(uint)*num_nodes); - infile.read ((char*)edgeList, sizeof(E)*num_edges); - nodePointer[num_nodes] = num_edges; - } - else if(graphFormat == "el" || graphFormat == "wel") - { - ifstream infile; - infile.open(graphFilePath); - stringstream ss; - uint max = 0; - string line; - uint edgeCounter = 0; - if(isWeighted) - { - vector edges; - EdgeWeighted newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - ss >> newEdge.w8; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - 
num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i edges; - Edge newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i> " << graphFilePath << endl; - - this->graphFormat = GetFileExtension(graphFilePath); - - if(graphFormat == "bcsr" || graphFormat == "bwcsr") - { - ifstream infile (graphFilePath, ios::in | ios::binary); - - infile.read ((char*)&num_nodes, sizeof(uint)); - infile.read ((char*)&num_edges, sizeof(uint)); - - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - - infile.read ((char*)nodePointer, sizeof(uint)*num_nodes); - infile.read ((char*)edgeList, sizeof(E)*num_edges); - nodePointer[num_nodes] = num_edges; - } - else if(graphFormat == "el" || graphFormat == "wel") - { - ifstream infile; - infile.open(graphFilePath); - stringstream ss; - uint max = 0; - string line; - uint edgeCounter = 0; - if(isWeighted) - { - vector edges; - EdgeWeighted newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - ss >> newEdge.w8; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i edges; - Edge newEdge; - while(getline( infile, line )) - { - ss.str(""); - ss.clear(); - ss << line; - - ss >> newEdge.source; - ss >> newEdge.end; - - edges.push_back(newEdge); - edgeCounter++; - - if(max < newEdge.source) - max = newEdge.source; - if(max < newEdge.end) - max = newEdge.end; - } - infile.close(); - num_nodes = max + 1; - num_edges = edgeCounter; - nodePointer = new uint[num_nodes+1]; - gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); - uint *degree = new uint[num_nodes]; - for(uint i=0; i> " << graphFilePath << endl; + this->graphFormat = GetFileExtension(graphFilePath); + if(graphFormat == "bcsr" || graphFormat == "bwcsr") + { + ifstream infile (graphFilePath, ios::in | ios::binary); + infile.read ((char*)&num_nodes, sizeof(uint)); + infile.read ((char*)&num_edges, sizeof(ull)); + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + ull n = num_nodes; + infile.read ((char*)nodePointer, (n+1) * sizeof(ull)); + infile.read ((char*)edgeList, num_edges * sizeof(E)); + nodePointer[num_nodes] = num_edges; + } + else if(graphFormat == "el" || graphFormat == "wel") + { + ifstream infile; + infile.open(graphFilePath); + stringstream ss; + uint max = 0; + string line; + ull edgeCounter = 0; + if(isWeighted) + { + vector edges; + EdgeWeighted newEdge; + while(getline( infile, line )) + { + ss.str(""); + ss.clear(); + ss << line; + ss >> newEdge.source; + ss >> newEdge.end; + ss >> 
newEdge.w8; + edges.push_back(newEdge); + edgeCounter++; + if(max < newEdge.source) + max = newEdge.source; + if(max < newEdge.end) + max = newEdge.end; + } + infile.close(); + num_nodes = max + 1; + num_edges = edgeCounter; + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + uint *degree = new uint[num_nodes]; + for(uint i=0; i edges; + Edge newEdge; + while(getline( infile, line )) + { + ss.str(""); + ss.clear(); + ss << line; + ss >> newEdge.source; + ss >> newEdge.end; + edges.push_back(newEdge); + edgeCounter++; + if(max < newEdge.source) + max = newEdge.source; + if(max < newEdge.end) + max = newEdge.end; + } + infile.close(); + num_nodes = max + 1; + num_edges = edgeCounter; + nodePointer = new ull[num_nodes+1]; + gpuErrorcheck(cudaMallocHost(&edgeList, (num_edges) * sizeof(E))); + uint *degree = new uint[num_nodes]; + for(uint i=0; i Partitioner::Partitioner() { - reset(); + reset(); } template void Partitioner::partition(Subgraph &subgraph, uint numActiveNodes) { - reset(); - - unsigned int from, to; - unsigned int left, right, mid; - unsigned int partitionSize; - unsigned int numNodesInPartition; - unsigned int numPartitionedEdges; - bool foundTo; - unsigned int accurCount; - - - from = 0; - to = numActiveNodes; // last in pointers - numPartitionedEdges = 0; - - do - { - left = from; - right = numActiveNodes; + reset(); + uint from, to; + uint left, right, mid; + ull partitionSize; + uint numNodesInPartition; + ull numPartitionedEdges; + bool foundTo; + ull accurCount; + from = 0; + to = numActiveNodes; // last in pointers + numPartitionedEdges = 0; + do + { + left = from; + right = numActiveNodes; - //cout << "#active nodes: " << numActiveNodes << endl; - //cout << "left: " << left << " right: " << right << endl; - //cout << "pointer to left: " << subgraph.activeNodesPointer[left] << " pointer to right: " << subgraph.activeNodesPointer[right] << endl; + std::cout << "#active nodes: " << numActiveNodes << std::endl; + std::cout << "left: " << left << " right: " << right << std::endl; + std::cout << "pointer to left: " << subgraph.activeNodesPointer[left] << " pointer to right: " << subgraph.activeNodesPointer[right] << std::endl; - partitionSize = subgraph.activeNodesPointer[right] - subgraph.activeNodesPointer[left]; - if(partitionSize <= subgraph.max_partition_size) - { - to = right; - } - else - { - foundTo = false; - accurCount = 10; - while(foundTo==false || accurCount>0) - { - mid = (left + right)/2; - partitionSize = subgraph.activeNodesPointer[mid] - subgraph.activeNodesPointer[from]; - if(foundTo == true) - accurCount--; - if(partitionSize <= subgraph.max_partition_size) - { - left = mid; - to = mid; - foundTo = true; - } - else - { - right = mid; - } - } - + partitionSize = subgraph.activeNodesPointer[right] - subgraph.activeNodesPointer[left]; + //std::cout << "partitionSize: " << partitionSize << std::endl; - if(to == numActiveNodes) - { - cout << "Error in Partitioning...\n"; - exit(-1); - } + if(partitionSize <= subgraph.max_partition_size) + { + to = right; + } + else + { + foundTo = false; + accurCount = 10; + while(foundTo==false || accurCount>0) + { + mid = (left + right)/2; + partitionSize = subgraph.activeNodesPointer[mid] - subgraph.activeNodesPointer[from]; + if(foundTo == true) + accurCount--; + if(partitionSize <= subgraph.max_partition_size) + { + left = mid; + to = mid; + foundTo = true; + } + else + { + right = mid; + } + } + if(to == numActiveNodes) + { + cout << "Error in Partitioning...\n"; + 
exit(-1); + } - } + } - partitionSize = subgraph.activeNodesPointer[to] - subgraph.activeNodesPointer[from]; - numNodesInPartition = to - from; + partitionSize = subgraph.activeNodesPointer[to] - subgraph.activeNodesPointer[from]; + numNodesInPartition = to - from; - //cout << "from: " << from << " to: " << to << endl; - //cout << "#nodes in P: " << numNodesInPartition << " #edges in P: " << partitionSize << endl; - - fromNode.push_back(from); - fromEdge.push_back(numPartitionedEdges); - partitionNodeSize.push_back(numNodesInPartition); - partitionEdgeSize.push_back(partitionSize); - - from = to; - numPartitionedEdges += partitionSize; - - } while (to != numActiveNodes); - - numPartitions = fromNode.size(); + //std::cout << "from: " << from << " to: " << to << std::endl; + //std::cout << "#nodes in P: " << numNodesInPartition << " #edges in P: " << partitionSize << std::endl; + fromNode.push_back(from); + fromEdge.push_back(numPartitionedEdges); + partitionNodeSize.push_back(numNodesInPartition); + partitionEdgeSize.push_back(partitionSize); + from = to; + numPartitionedEdges += partitionSize; + } while (to != numActiveNodes); + numPartitions = fromNode.size(); } template void Partitioner::reset() { - fromNode.clear(); - fromEdge.clear(); - partitionNodeSize.clear(); - partitionEdgeSize.clear(); - numPartitions = 0; + fromNode.clear(); + fromEdge.clear(); + partitionNodeSize.clear(); + partitionEdgeSize.clear(); + numPartitions = 0; } template class Partitioner; diff --git a/shared/partitioner.cuh b/shared/partitioner.cuh index 8a1e1fb..9118037 100644 --- a/shared/partitioner.cuh +++ b/shared/partitioner.cuh @@ -11,17 +11,17 @@ class Partitioner private: public: - uint numPartitions; - vector fromNode; - vector fromEdge; - vector partitionNodeSize; - vector partitionEdgeSize; - Partitioner(); + uint numPartitions; + vector fromNode; + vector fromEdge; + vector partitionNodeSize; + vector partitionEdgeSize; + Partitioner(); void partition(Subgraph &subgraph, uint numActiveNodes); void reset(); }; -#endif // PARTITIONER_CUH +#endif // PARTITIONER_CUH diff --git a/shared/subgraph.cu b/shared/subgraph.cu index bef7f7c..09c6291 100644 --- a/shared/subgraph.cu +++ b/shared/subgraph.cu @@ -1,44 +1,50 @@ - #include "subgraph.cuh" #include "gpu_error_check.cuh" #include "graph.cuh" #include - +#include template -Subgraph::Subgraph(uint num_nodes, uint num_edges) +Subgraph::Subgraph(uint num_nodes, ull num_edges) { - cudaProfilerStart(); - cudaError_t error; - cudaDeviceProp dev; - int deviceID; - cudaGetDevice(&deviceID); - error = cudaGetDeviceProperties(&dev, deviceID); - if(error != cudaSuccess) - { - printf("Error: %s\n", cudaGetErrorString(error)); - exit(-1); - } - cudaProfilerStop(); - - max_partition_size = 0.9 * (dev.totalGlobalMem - 8*4*num_nodes) / sizeof(E); - //max_partition_size = 1000000000; - - if(max_partition_size > DIST_INFINITY) - max_partition_size = DIST_INFINITY; - - //cout << "Max Partition Size: " << max_partition_size << endl; - - this->num_nodes = num_nodes; - this->num_edges = num_edges; - - gpuErrorcheck(cudaMallocHost(&activeNodes, num_nodes * sizeof(uint))); - gpuErrorcheck(cudaMallocHost(&activeNodesPointer, (num_nodes+1) * sizeof(uint))); - gpuErrorcheck(cudaMallocHost(&activeEdgeList, num_edges * sizeof(E))); - - gpuErrorcheck(cudaMalloc(&d_activeNodes, num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeNodesPointer, (num_nodes+1) * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeEdgeList, (max_partition_size) * sizeof(E))); + 
cudaProfilerStart(); + cudaError_t error; + cudaDeviceProp dev; + int deviceID; + cudaGetDevice(&deviceID); + error = cudaGetDeviceProperties(&dev, deviceID); + if(error != cudaSuccess) + { + printf("Error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaProfilerStop(); + + std::cout << "num_edges: " << num_edges << std::endl; + std::cout << "device memory(bytes): " << dev.totalGlobalMem << std::endl; + + float estimated_gpu_memory_size = float(num_nodes) * 20 * 4; + if (dev.totalGlobalMem > estimated_gpu_memory_size ){ + max_partition_size = 0.9 * (dev.totalGlobalMem - estimated_gpu_memory_size) / sizeof(E); + }else { + std::cout << "no sufficient memory" << std::endl; + exit(-1); + } + //if(max_partition_size > DIST_INFINITY) + // max_partition_size = DIST_INFINITY; + std::cout << "Max Partition Size: " << max_partition_size << std::endl; + this->num_nodes = num_nodes; + this->num_edges = num_edges; + + ull m = num_nodes; + gpuErrorcheck(cudaMallocHost(&activeNodes, m * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&activeNodesPointer, (m+1) * sizeof(ull))); + gpuErrorcheck(cudaMallocHost(&activeEdgeList, num_edges * sizeof(E))); + + gpuErrorcheck(cudaMalloc(&d_activeNodes, m * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_activeNodesPointer, (m+1) * sizeof(ull))); + gpuErrorcheck(cudaMalloc(&d_activeEdgeList, (max_partition_size) * sizeof(E))); + std::cout << "subgraph .." << std::endl; } template class Subgraph; @@ -48,7 +54,7 @@ template class Subgraph; //unsigned int numActiveNodes = 1; //subgraph.activeNodes[0] = SOURCE_NODE; //for(unsigned int i=graph.nodePointer[SOURCE_NODE], j=0; i acc) - { - activeNodesLabeling[id] = 1; - } - else - { - activeNodesLabeling[id] = 0; - } - activeNodesDegree[id] = 0; - if(activeNodesLabeling[id] == 1) - activeNodesDegree[id] = outDegree[id]; - } + uint id = blockDim.x * blockIdx.x + threadIdx.x; + if(id < numNodes){ + if(delta[id] > acc) + { + activeNodesLabeling[id] = 1; + } + else + { + activeNodesLabeling[id] = 0; + } + activeNodesDegree[id] = 0; + if(activeNodesLabeling[id] == 1) + activeNodesDegree[id] = outDegree[id]; + } } -__global__ void makeQueue(unsigned int *activeNodes, unsigned int *activeNodesLabeling, - unsigned int *prefixLabeling, unsigned int numNodes) +__global__ void makeQueue(uint *activeNodes, uint *activeNodesLabeling, + uint *prefixLabeling, uint numNodes) { - unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; - if(id < numNodes && activeNodesLabeling[id] == 1){ - activeNodes[prefixLabeling[id]] = id; - } + uint id = blockDim.x * blockIdx.x + threadIdx.x; + if(id < numNodes && activeNodesLabeling[id] == 1){ + activeNodes[prefixLabeling[id]] = id; + } } -__global__ void makeActiveNodesPointer(unsigned int *activeNodesPointer, unsigned int *activeNodesLabeling, - unsigned int *prefixLabeling, unsigned int *prefixSumDegrees, - unsigned int numNodes) +__global__ void makeActiveNodesPointer(ull *activeNodesPointer, uint *activeNodesLabeling, + uint *prefixLabeling, ull *prefixSumDegrees, + uint numNodes) { - unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; - if(id < numNodes && activeNodesLabeling[id] == 1){ - activeNodesPointer[prefixLabeling[id]] = prefixSumDegrees[id]; - } + uint id = blockDim.x * blockIdx.x + threadIdx.x; + if(id < numNodes && activeNodesLabeling[id] == 1){ + activeNodesPointer[prefixLabeling[id]] = prefixSumDegrees[id]; + } } // pthread template -void dynamic(unsigned int tId, - unsigned int numThreads, - unsigned int numActiveNodes, - unsigned int *activeNodes, - unsigned int 
*outDegree, - unsigned int *activeNodesPointer, - unsigned int *nodePointer, - E *activeEdgeList, - E *edgeList) +void dynamic(uint tId, + uint numThreads, + uint numActiveNodes, + uint *activeNodes, + uint *outDegree, + ull *activeNodesPointer, + ull *nodePointer, + E *activeEdgeList, + E *edgeList) { - unsigned int chunkSize = ceil(numActiveNodes / numThreads); - unsigned int left, right; - left = tId * chunkSize; - right = min(left+chunkSize, numActiveNodes); - - unsigned int thisNode; - unsigned int thisDegree; - unsigned int fromHere; - unsigned int fromThere; - - for(unsigned int i=left; i SubgraphGenerator::SubgraphGenerator(Graph &graph) { - gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (graph.num_nodes+1) * sizeof(unsigned int))); - - gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (graph.num_nodes+1) * sizeof(unsigned int))); + ull l = graph.num_nodes; + gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (l+1) * sizeof(ull))); + + gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (l+1) * sizeof(ull))); } template SubgraphGenerator::SubgraphGenerator(GraphPR &graph) { - gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (graph.num_nodes+1) * sizeof(unsigned int))); - - gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixLabeling, graph.num_nodes * sizeof(unsigned int))); - gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (graph.num_nodes+1) * sizeof(unsigned int))); + ull l = graph.num_nodes; + gpuErrorcheck(cudaMallocHost(&activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMallocHost(&prefixSumDegrees, (l+1) * sizeof(ull))); + + gpuErrorcheck(cudaMalloc(&d_activeNodesLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_activeNodesDegree, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixLabeling, l * sizeof(uint))); + gpuErrorcheck(cudaMalloc(&d_prefixSumDegrees , (l+1) * sizeof(ull))); } template void SubgraphGenerator::generate(Graph &graph, Subgraph &subgraph) { - //std::chrono::time_point startDynG, finishDynG; - //startDynG = 
std::chrono::system_clock::now(); - - prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_label1, graph.d_label2, graph.num_nodes); - - thrust::device_ptr ptr_labeling(d_activeNodesLabeling); - thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); - - subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); - //cout << "Number of Active Nodes = " << subgraph.numActiveNodes << endl; - - thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); - - makeQueue<<>>(subgraph.d_activeNodes, d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - thrust::device_ptr ptr_degrees(d_activeNodesDegree); - thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); - - thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); - - makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - unsigned int numActiveEdges = 0; - if(subgraph.numActiveNodes>0) - numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; - - unsigned int last = numActiveEdges; - gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(unsigned int), cudaMemcpyHostToDevice)); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, (subgraph.numActiveNodes+1)*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - - //finishDynG = std::chrono::system_clock::now(); - //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; - //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); - //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; - - //td::chrono::time_point startDynC, finishDynC; - //startDynC = std::chrono::system_clock::now(); - - unsigned int numThreads = NUM_THREADS; - - if(subgraph.numActiveNodes < THRESHOLD_THREAD) - numThreads = 1; - - thread runThreads[numThreads]; - - for(unsigned int t=0; t, - t, - numThreads, - subgraph.numActiveNodes, - subgraph.activeNodes, - graph.outDegree, - subgraph.activeNodesPointer, - graph.nodePointer, - subgraph.activeEdgeList, - graph.edgeList); - - } - - for(unsigned int t=0; t elapsed_seconds_dync = finishDynC-startDynC; - //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); - //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; - + //std::chrono::time_point startDynG, finishDynG; + //startDynG = std::chrono::system_clock::now(); + prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_label1, graph.d_label2, graph.num_nodes); + thrust::device_ptr ptr_labeling(d_activeNodesLabeling); + thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); + subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); + //std::cout << "Number of Active Nodes = " << subgraph.numActiveNodes << std::endl; + thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); + makeQueue<<>>(subgraph.d_activeNodes, 
d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, subgraph.numActiveNodes*sizeof(uint), cudaMemcpyDeviceToHost)); + thrust::device_ptr ptr_degrees(d_activeNodesDegree); + thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); + thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); + makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); + ull n = subgraph.numActiveNodes; + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, n*sizeof(ull), cudaMemcpyDeviceToHost)); + ull numActiveEdges = 0; + if(subgraph.numActiveNodes>0) + numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; + ull last = numActiveEdges; + gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(ull), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, (n+1)*sizeof(ull), cudaMemcpyDeviceToHost)); + //finishDynG = std::chrono::system_clock::now(); + //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; + //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); + //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; + //td::chrono::time_point startDynC, finishDynC; + //startDynC = std::chrono::system_clock::now(); + uint numThreads = NUM_THREADS; + + if(subgraph.numActiveNodes < THRESHOLD_THREAD) + numThreads = 1; + + thread runThreads[numThreads]; + for(uint t=0; t < numThreads; t++) + { + + runThreads[t] = thread(dynamic, + t, + numThreads, + subgraph.numActiveNodes, + subgraph.activeNodes, + graph.outDegree, + subgraph.activeNodesPointer, + graph.nodePointer, + subgraph.activeEdgeList, + graph.edgeList); + + } + for(uint t=0; t elapsed_seconds_dync = finishDynC-startDynC; + //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); + //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; } @@ -208,80 +189,64 @@ void SubgraphGenerator::generate(Graph &graph, Subgraph &subgraph) template void SubgraphGenerator::generate(GraphPR &graph, Subgraph &subgraph, float acc) { - //std::chrono::time_point startDynG, finishDynG; - //startDynG = std::chrono::system_clock::now(); - - prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_delta, graph.num_nodes, acc); - - thrust::device_ptr ptr_labeling(d_activeNodesLabeling); - thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); - - subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); - //cout << "Number of Active Nodes = " << subgraph.numActiveNodes << endl; - - thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); - - makeQueue<<>>(subgraph.d_activeNodes, d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - thrust::device_ptr ptr_degrees(d_activeNodesDegree); - thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); - - thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); - - makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, 
d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, subgraph.numActiveNodes*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - unsigned int numActiveEdges = 0; - if(subgraph.numActiveNodes>0) - numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; - - unsigned int last = numActiveEdges; - gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(unsigned int), cudaMemcpyHostToDevice)); - - gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, (subgraph.numActiveNodes+1)*sizeof(unsigned int), cudaMemcpyDeviceToHost)); - - - //finishDynG = std::chrono::system_clock::now(); - //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; - //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); - //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; - - //td::chrono::time_point startDynC, finishDynC; - //startDynC = std::chrono::system_clock::now(); - - unsigned int numThreads = NUM_THREADS; - - if(subgraph.numActiveNodes < THRESHOLD_THREAD) - numThreads = 1; - - thread runThreads[numThreads]; - - for(unsigned int t=0; t, - t, - numThreads, - subgraph.numActiveNodes, - subgraph.activeNodes, - graph.outDegree, - subgraph.activeNodesPointer, - graph.nodePointer, - subgraph.activeEdgeList, - graph.edgeList); - - } - - for(unsigned int t=0; t elapsed_seconds_dync = finishDynC-startDynC; - //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); - //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; - + //std::chrono::time_point startDynG, finishDynG; + //startDynG = std::chrono::system_clock::now(); + prePrefix<<>>(d_activeNodesLabeling, d_activeNodesDegree, graph.d_outDegree, graph.d_delta, graph.num_nodes, acc); + thrust::device_ptr ptr_labeling(d_activeNodesLabeling); + thrust::device_ptr ptr_labeling_prefixsum(d_prefixLabeling); + subgraph.numActiveNodes = thrust::reduce(ptr_labeling, ptr_labeling + graph.num_nodes); + //cout << "Number of Active Nodes = " << subgraph.numActiveNodes << endl; + thrust::exclusive_scan(ptr_labeling, ptr_labeling + graph.num_nodes, ptr_labeling_prefixsum); + makeQueue<<>>(subgraph.d_activeNodes, d_activeNodesLabeling, d_prefixLabeling, graph.num_nodes); + + + ull n = subgraph.numActiveNodes; + + gpuErrorcheck(cudaMemcpy(subgraph.activeNodes, subgraph.d_activeNodes, n*sizeof(uint), cudaMemcpyDeviceToHost)); + thrust::device_ptr ptr_degrees(d_activeNodesDegree); + thrust::device_ptr ptr_degrees_prefixsum(d_prefixSumDegrees); + thrust::exclusive_scan(ptr_degrees, ptr_degrees + graph.num_nodes, ptr_degrees_prefixsum); + makeActiveNodesPointer<<>>(subgraph.d_activeNodesPointer, d_activeNodesLabeling, d_prefixLabeling, d_prefixSumDegrees, graph.num_nodes); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, subgraph.d_activeNodesPointer, n*sizeof(ull), cudaMemcpyDeviceToHost)); + ull numActiveEdges = 0; + if(subgraph.numActiveNodes>0) + numActiveEdges = subgraph.activeNodesPointer[subgraph.numActiveNodes-1] + graph.outDegree[subgraph.activeNodes[subgraph.numActiveNodes-1]]; + ull last = numActiveEdges; + gpuErrorcheck(cudaMemcpy(subgraph.d_activeNodesPointer+subgraph.numActiveNodes, &last, sizeof(ull), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(subgraph.activeNodesPointer, 
subgraph.d_activeNodesPointer, (n+1)*sizeof(ull), cudaMemcpyDeviceToHost)); + //finishDynG = std::chrono::system_clock::now(); + //std::chrono::duration elapsed_seconds_dyng = finishDynG-startDynG; + //std::time_t finish_time_dyng = std::chrono::system_clock::to_time_t(finishDynG); + //std::cout << "Dynamic GPU Time = " << elapsed_seconds_dyng.count() << std::endl; + //td::chrono::time_point startDynC, finishDynC; + //startDynC = std::chrono::system_clock::now(); + uint numThreads = NUM_THREADS; + + if(subgraph.numActiveNodes < THRESHOLD_THREAD) + numThreads = 1; + + thread runThreads[numThreads]; + for(uint t=0; t, + t, + numThreads, + subgraph.numActiveNodes, + subgraph.activeNodes, + graph.outDegree, + subgraph.activeNodesPointer, + graph.nodePointer, + subgraph.activeEdgeList, + graph.edgeList); + + } + for(uint t=0; t elapsed_seconds_dync = finishDynC-startDynC; + //std::time_t finish_time_dync = std::chrono::system_clock::to_time_t(finishDynC); + //std::cout << "Dynamic CPU Time = " << elapsed_seconds_dync.count() << std::endl; } template class SubgraphGenerator; diff --git a/shared/subgraph_generator.cuh b/shared/subgraph_generator.cuh index 9f2f9c6..bf7e364 100644 --- a/shared/subgraph_generator.cuh +++ b/shared/subgraph_generator.cuh @@ -15,21 +15,21 @@ class SubgraphGenerator private: public: - unsigned int *activeNodesLabeling; - unsigned int *activeNodesDegree; - unsigned int *prefixLabeling; - unsigned int *prefixSumDegrees; - unsigned int *d_activeNodesLabeling; - unsigned int *d_activeNodesDegree; - unsigned int *d_prefixLabeling; - unsigned int *d_prefixSumDegrees; - SubgraphGenerator(Graph &graph); - SubgraphGenerator(GraphPR &graph); - void generate(Graph &graph, Subgraph &subgraph); - void generate(GraphPR &graph, Subgraph &subgraph, float acc); + uint *activeNodesLabeling; + uint *activeNodesDegree; + uint *prefixLabeling; + ull *prefixSumDegrees; + uint *d_activeNodesLabeling; + uint *d_activeNodesDegree; + uint *d_prefixLabeling; + ull *d_prefixSumDegrees; + SubgraphGenerator(Graph &graph); + SubgraphGenerator(GraphPR &graph); + void generate(Graph &graph, Subgraph &subgraph); + void generate(GraphPR &graph, Subgraph &subgraph, float acc); }; -#endif // SUBGRAPH_GENERATOR_HPP +#endif // SUBGRAPH_GENERATOR_HPP diff --git a/shared/subway_utilities.cpp b/shared/subway_utilities.cpp index 605f333..ebd36e3 100644 --- a/shared/subway_utilities.cpp +++ b/shared/subway_utilities.cpp @@ -1,72 +1,71 @@ - #include "subway_utilities.hpp" void utilities::PrintResults(uint *results, uint n) { - cout << "Results of first "<< n << " nodes:\n["; - for(int i=0; i0) - cout << " "; - cout << i << ":" << results[i]; - } - cout << "]\n"; + cout << "Results of first "<< n << " nodes:\n["; + for(int i=0; i0) + cout << " "; + cout << i << ":" << results[i]; + } + cout << "]\n"; } void utilities::PrintResults(float *results, uint n) { - cout << "Results of first "<< n << " nodes:\n["; - for(int i=0; i0) - cout << " "; - cout << i << ":" << results[i]; - } - cout << "]\n"; + cout << "Results of first "<< n << " nodes:\n["; + for(int i=0; i0) + cout << " "; + cout << i << ":" << results[i]; + } + cout << "]\n"; } void utilities::PrintResults(double *results, uint n) { - cout << "Results of first "<< n << " nodes:\n["; - for(int i=0; i0) - cout << " "; - cout << i << ":" << results[i]; - } - cout << "]\n"; + cout << "Results of first "<< n << " nodes:\n["; + for(int i=0; i0) + cout << " "; + cout << i << ":" << results[i]; + } + cout << "]\n"; } void utilities::SaveResults(string filepath, uint 
*results, uint n) { - cout << "Saving the results into the following file:\n"; - cout << ">> " << filepath << endl; - ofstream outfile; - outfile.open(filepath); - for(int i=0; i> " << filepath << endl; + ofstream outfile; + outfile.open(filepath); + for(int i=0; i Test::Test() { - this->a = 1; - this->b = 1; + this->a = 1; + this->b = 1; } template int Test::sum(int a, int b) { - return a + b; + return a + b; } diff --git a/shared/test.cuh b/shared/test.cuh index 1158070..563c964 100644 --- a/shared/test.cuh +++ b/shared/test.cuh @@ -7,10 +7,10 @@ class Test private: public: - int a; - int b; + int a; + int b; Test(); int sum(int a, int b); }; -#endif // TEST_HPP +#endif // TEST_HPP diff --git a/shared/timer.cpp b/shared/timer.cpp index a79e65c..3442c47 100644 --- a/shared/timer.cpp +++ b/shared/timer.cpp @@ -4,20 +4,20 @@ void Timer::Start() { - //A = chrono::system_clock::now(); - gettimeofday( &StartingTime, NULL ); + //A = chrono::system_clock::now(); + gettimeofday( &StartingTime, NULL ); } float Timer::Finish() { - //B = std::chrono::system_clock::now(); - //chrono::duration elapsed_seconds = B - A; - //time_t finish_time = std::chrono::system_clock::to_time_t(B); - //cout << "title" << elapsed_seconds.count()*1000; - timeval PausingTime, ElapsedTime; - gettimeofday( &PausingTime, NULL ); - timersub(&PausingTime, &StartingTime, &ElapsedTime); - float d = ElapsedTime.tv_sec*1000.0+ElapsedTime.tv_usec/1000.0; - return d; + //B = std::chrono::system_clock::now(); + //chrono::duration elapsed_seconds = B - A; + //time_t finish_time = std::chrono::system_clock::to_time_t(B); + //cout << "title" << elapsed_seconds.count()*1000; + timeval PausingTime, ElapsedTime; + gettimeofday( &PausingTime, NULL ); + timersub(&PausingTime, &StartingTime, &ElapsedTime); + float d = ElapsedTime.tv_sec*1000.0+ElapsedTime.tv_usec/1000.0; + return d; } diff --git a/shared/timer.hpp b/shared/timer.hpp index ff324d1..f069172 100644 --- a/shared/timer.hpp +++ b/shared/timer.hpp @@ -10,14 +10,14 @@ class Timer { private: - //chrono::time_point A, B; - timeval StartingTime; + //chrono::time_point A, B; + timeval StartingTime; public: void Start(); float Finish(); }; -#endif // TIMER_HPP +#endif // TIMER_HPP diff --git a/subway/Makefile b/subway/Makefile index f56a951..bc4af04 100644 --- a/subway/Makefile +++ b/subway/Makefile @@ -1,8 +1,8 @@ CC=g++ NC=nvcc -CFLAGS=-std=c++11 -O3 -NFLAGS=-arch=sm_60 +CFLAGS=-std=c++14 -O3 +#NFLAGS=-arch=sm_80 SHARED=../shared diff --git a/subway/bfs-async.cu b/subway/bfs-async.cu index 07bad99..d44e23b 100644 --- a/subway/bfs-async.cu +++ b/subway/bfs-async.cu @@ -12,112 +12,89 @@ int main(int argc, char** argv) { - cudaFree(0); + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - for(unsigned int i=0; i graph(arguments.input, false); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); - - for(unsigned int i=0; i subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph); + for(uint i=0; i partitioner; - - timer.Start(); - - unsigned int gItr = 0; - - bool finished; - bool *d_finished; - 
gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); - - while (subgraph.numActiveNodes>0) - { - gItr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint gItr = 0; + bool finished; + bool *d_finished; + gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); + while (subgraph.numActiveNodes>0) + { + gItr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - - uint itr = 0; - do - { - itr++; - finished = true; - gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); - - bfs_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - d_finished, - (itr%2==1) ? graph.d_label1 : graph.d_label2, - (itr%2==1) ? graph.d_label2 : graph.d_label1); + //moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + uint itr = 0; + do + { + itr++; + finished = true; + gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); + bfs_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + d_finished, + (itr%2==1) ? graph.d_label1 : graph.d_label2, + (itr%2==1) ? graph.d_label2 : graph.d_label1); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - - gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); - }while(!(finished)); - - cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaDeviceSynchronize(); + gpuErrorcheck( cudaPeekAtLastError() ); + gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); + }while(!(finished)); + cout << itr << ((itr>1) ? 
" Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; + } + subgen.generate(graph, subgraph); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/bfs-sync.cu b/subway/bfs-sync.cu index 68a5610..d5e6bdf 100644 --- a/subway/bfs-sync.cu +++ b/subway/bfs-sync.cu @@ -12,92 +12,79 @@ int main(int argc, char** argv) { - cudaFree(0); + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - for(unsigned int i=0; i graph(arguments.input, false); + graph.ReadGraph(); + + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice)); + + Subgraph subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph); - Partitioner partitioner; - - timer.Start(); - - uint itr = 0; - - while (subgraph.numActiveNodes>0) - { - itr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint itr = 0; + while (subgraph.numActiveNodes>0) + { + itr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - bfs_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - //d_finished, - graph.d_label1, - graph.d_label2); + bfs_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + //d_finished, + graph.d_label1, + graph.d_label2); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - cout << "Number of 
iterations = " << itr << endl; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaDeviceSynchronize(); + gpuErrorcheck( cudaPeekAtLastError() ); + } + subgen.generate(graph, subgraph); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + cout << "Number of iterations = " << itr << endl; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/cc-async.cu b/subway/cc-async.cu index 676b103..531726c 100644 --- a/subway/cc-async.cu +++ b/subway/cc-async.cu @@ -12,104 +12,84 @@ int main(int argc, char** argv) { - cudaFree(0); + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - for(unsigned int i=0; i graph(arguments.input, false); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); + ull n = graph.num_nodes; + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice)); + + Subgraph subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph); - Partitioner partitioner; - - timer.Start(); - - unsigned int gItr = 0; - - bool finished; - bool *d_finished; - gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); - - while (subgraph.numActiveNodes>0) - { - gItr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint gItr = 0; + bool finished; + bool *d_finished; + gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool))); + while (subgraph.numActiveNodes>0) + { + gItr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - - uint itr = 0; - do - { - itr++; - finished = true; - gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); - - cc_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - d_finished, - (itr%2==1) ? 
graph.d_label1 : graph.d_label2, - (itr%2==1) ? graph.d_label2 : graph.d_label1); + //moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + mixLabels<<>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); + uint itr = 0; + do + { + itr++; + finished = true; + gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice)); + cc_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + d_finished, + (itr%2==1) ? graph.d_label1 : graph.d_label2, + (itr%2==1) ? graph.d_label2 : graph.d_label1); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - - gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); - }while(!(finished)); - - cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaDeviceSynchronize(); + gpuErrorcheck( cudaPeekAtLastError() ); + gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost)); + }while(!(finished)); + cout << itr << ((itr>1) ? 
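The convergence test in all the async variants is a single device-resident boolean: the host optimistically sets it to `true` before each launch, any thread that still changes state clears it, and the host loops until a launch completes with the flag intact. A standalone version of the same pattern, with a hypothetical kernel:

```
#include <cstdio>

// any thread that still makes progress clears the shared flag
__global__ void relax_once(int n, int *val, bool *d_finished)
{
	int tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid < n && val[tid] > 0)
	{
		val[tid]--;
		*d_finished = false;
	}
}

int main()
{
	const int n = 4;
	int h_val[n] = {3, 1, 0, 2};
	int *d_val;
	bool finished, *d_finished;
	cudaMalloc(&d_val, n * sizeof(int));
	cudaMalloc(&d_finished, sizeof(bool));
	cudaMemcpy(d_val, h_val, n * sizeof(int), cudaMemcpyHostToDevice);
	int rounds = 0;
	do {
		finished = true;                // optimistic reset on the host
		cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice);
		relax_once<<<1, 32>>>(n, d_val, d_finished);
		cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost);
		rounds++;
	} while (!finished);
	printf("converged after %d rounds\n", rounds);   // 4: three working passes + one clean pass
	return 0;
}
```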
" Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; + } + subgen.generate(graph, subgraph); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/cc-sync.cu b/subway/cc-sync.cu index 2d40951..f485d4a 100644 --- a/subway/cc-sync.cu +++ b/subway/cc-sync.cu @@ -12,89 +12,84 @@ int main(int argc, char** argv) { - cudaFree(0); - - ArgumentParser arguments(argc, argv, false, false); - - Timer timer; - timer.Start(); - - Graph graph(arguments.input, false); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - - for(unsigned int i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph); - - - Partitioner partitioner; - - timer.Start(); - - uint itr = 0; - - while (subgraph.numActiveNodes>0) - { - itr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]); - - cc_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - //d_finished, - graph.d_label1, - graph.d_label2); - - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - } - - subgen.generate(graph, subgraph); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - cout << "Number of iterations = " << itr << endl; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); - - if(arguments.hasOutput) - utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); + cudaFree(0); + + ArgumentParser arguments(argc, argv, false, false); + Timer timer; + timer.Start(); + Graph graph(arguments.input, false); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + + for(uint i=0; i < graph.num_nodes; i++) + { + graph.value[i] = i; + graph.label1[i] = false; + graph.label2[i] = true; + } + + ull n = graph.num_nodes; + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice)); + Subgraph subgraph(graph.num_nodes, graph.num_edges); + + SubgraphGenerator subgen(graph); + + subgen.generate(graph, subgraph); + + Partitioner partitioner; + timer.Start(); + uint itr = 0; + + + while (subgraph.numActiveNodes>0) + { + std::cout << "number_of_active_nodes: " << subgraph.numActiveNodes << std::endl; + itr++; + partitioner.partition(subgraph, 
diff --git a/subway/pr-async.cu b/subway/pr-async.cu
index c5fe5e9..672eee5 100644
--- a/subway/pr-async.cu
+++ b/subway/pr-async.cu
@@ -14,112 +14,86 @@ int main(int argc, char** argv)
 {
-	
-	cudaFree(0);
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	GraphPR<OutEdge> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	//for(unsigned int i=0; i<100; i++)
-	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
-	
-	float initPR = 0.15;
-	float acc = 0.01;
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = initPR;
-		graph.delta[i] = initPR;
-	}
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(float), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_delta, graph.delta, graph.num_nodes * sizeof(float), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdge> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdge> subgen(graph);
-	
-	subgen.generate(graph, subgraph, acc);
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	GraphPR<OutEdge> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	//for(unsigned int i=0; i<100; i++)
+	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
+	float initPR = 0.15;
+	float acc = 0.01;
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = initPR;
+		graph.delta[i] = initPR;
+	}
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(float), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_delta, graph.delta, n * sizeof(float), cudaMemcpyHostToDevice));
+	Subgraph<OutEdge> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdge> subgen(graph);
+	subgen.generate(graph, subgraph, acc);
 
-	Partitioner<OutEdge> partitioner;
-	
-	timer.Start();
-	
-	uint gItr = 0;
-	
-	bool finished;
-	bool *d_finished;
-	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		gItr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			//mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			
-			uint itr = 0;
-			do
-			{
-				itr++;
-				finished = true;
-				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
-				
-				pr_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, graph.d_delta, d_finished, acc);
-				
-				cudaDeviceSynchronize();
-				gpuErrorcheck( cudaPeekAtLastError() );
-				
-				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
-			}while(!(finished));
-			
-			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
-		}
-		
-		subgen.generate(graph, subgraph, acc);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(float), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdge> partitioner;
+	timer.Start();
+	uint gItr = 0;
+	bool finished;
+	bool *d_finished;
+	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
+	while (subgraph.numActiveNodes>0)
+	{
+		gItr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			//mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			uint itr = 0;
+			do
+			{
+				itr++;
+				finished = true;
+				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
+				
+				pr_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, graph.d_delta, d_finished, acc);
+				
+				cudaDeviceSynchronize();
+				gpuErrorcheck( cudaPeekAtLastError() );
+				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
+			}while(!(finished));
+			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
+		}
+		subgen.generate(graph, subgraph, acc);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(float), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
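pr-async is a delta-based PageRank: ranks start at `initPR = 0.15`, each active vertex forwards a damped share of its pending `delta` along its out-edges, and a vertex stays active only while that pending mass exceeds `acc = 0.01`. A scalar sketch of the per-vertex update; the damping factor 0.85 and the self-contained propagation are assumptions for illustration, not lifted from the kernel:

```
#include <cstdio>

int main()
{
	float initPR = 0.15f, acc = 0.01f;
	float value = initPR;        // current rank of one vertex
	float delta = initPR;        // rank mass not yet propagated
	unsigned int outDegree = 4;
	while (delta > acc)          // vertex stays active while delta exceeds acc
	{
		float share = 0.85f * delta / outDegree;  // damped share per out-edge
		value += delta;
		delta = share;           // in the real kernel this lands on the neighbours
		printf("value=%f pending=%f\n", value, delta);
	}
	return 0;
}
```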
" Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl; + } + subgen.generate(graph, subgraph, acc); + } + float runtime = timer.Finish(); + cout << "Processing finished in " << runtime/1000 << " (s).\n"; + gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(float), cudaMemcpyDeviceToHost)); + utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + if(arguments.hasOutput) + utilities::SaveResults(arguments.output, graph.value, graph.num_nodes); } diff --git a/subway/pr-sync.cu b/subway/pr-sync.cu index f520663..376bb28 100644 --- a/subway/pr-sync.cu +++ b/subway/pr-sync.cu @@ -13,97 +13,75 @@ int main(int argc, char** argv) -{ - cudaFree(0); +{ + cudaFree(0); - ArgumentParser arguments(argc, argv, true, false); - - Timer timer; - timer.Start(); - - GraphPR graph(arguments.input, true); - graph.ReadGraph(); - - float readtime = timer.Finish(); - cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; - - //for(unsigned int i=0; i<100; i++) - // cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8; - - float initPR = 0.15; - float acc = 0.01; - - for(unsigned int i=0; i graph(arguments.input, true); + graph.ReadGraph(); + float readtime = timer.Finish(); + cout << "Graph Reading finished in " << readtime/1000 << " (s).\n"; + //for(unsigned int i=0; i<100; i++) + // cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8; + float initPR = 0.15; + float acc = 0.01; + for(uint i=0; i subgraph(graph.num_nodes, graph.num_edges); - - SubgraphGenerator subgen(graph); - - subgen.generate(graph, subgraph, acc); + ull n = graph.num_nodes; + gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(float), cudaMemcpyHostToDevice)); + gpuErrorcheck(cudaMemcpy(graph.d_delta, graph.delta, n * sizeof(float), cudaMemcpyHostToDevice)); + Subgraph subgraph(graph.num_nodes, graph.num_edges); + SubgraphGenerator subgen(graph); + subgen.generate(graph, subgraph, acc); - Partitioner partitioner; - - timer.Start(); - - uint gItr = 0; - - - while (subgraph.numActiveNodes>0) - { - gItr++; - - partitioner.partition(subgraph, subgraph.numActiveNodes); - // a super iteration - for(int i=0; i partitioner; + timer.Start(); + uint gItr = 0; + while (subgraph.numActiveNodes>0) + { + gItr++; + partitioner.partition(subgraph, subgraph.numActiveNodes); + // a super iteration + for(int i=0; i>>(partitioner.partitionNodeSize[i], - partitioner.fromNode[i], - partitioner.fromEdge[i], - subgraph.d_activeNodes, - subgraph.d_activeNodesPointer, - subgraph.d_activeEdgeList, - graph.d_outDegree, - graph.d_value, - graph.d_delta, - acc); + pr_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], + partitioner.fromNode[i], + partitioner.fromEdge[i], + subgraph.d_activeNodes, + subgraph.d_activeNodesPointer, + subgraph.d_activeEdgeList, + graph.d_outDegree, + graph.d_value, + graph.d_delta, + acc); - cudaDeviceSynchronize(); - gpuErrorcheck( cudaPeekAtLastError() ); - - } - - subgen.generate(graph, subgraph, acc); - - } - - float runtime = timer.Finish(); - cout << "Processing finished in " << runtime/1000 << " (s).\n"; - - cout << "Number of iterations = " << gItr << endl; - - gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(float), cudaMemcpyDeviceToHost)); - - utilities::PrintResults(graph.value, min(30, graph.num_nodes)); + cudaDeviceSynchronize(); + 
diff --git a/subway/sssp-async.cu b/subway/sssp-async.cu
index 036b5b7..a9a6cc5 100644
--- a/subway/sssp-async.cu
+++ b/subway/sssp-async.cu
@@ -14,124 +14,98 @@ int main(int argc, char** argv)
 {
-	/*
-	Test test;
-	cout << test.sum(20, 30) << endl;
-	*/
-	
-	cudaFree(0);
+	/*
+	Test test;
+	cout << test.sum(20, 30) << endl;
+	*/
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	//for(unsigned int i=0; i<100; i++)
-	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = DIST_INFINITY;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = 0;
-	graph.label1[arguments.sourceNode] = true;
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	//for(unsigned int i=0; i<100; i++)
+	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = DIST_INFINITY;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = 0;
+	graph.label1[arguments.sourceNode] = true;
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint gItr = 0;
-	
-	bool finished;
-	bool *d_finished;
-	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		gItr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			
-			uint itr = 0;
-			do
-			{
-				itr++;
-				finished = true;
-				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
-				
-				sssp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
-				
-				cudaDeviceSynchronize();
-				gpuErrorcheck( cudaPeekAtLastError() );
-				
-				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
-			}while(!(finished));
-			
-			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	//for(int i=0; i<20; i++)
-	//	cout << graph.value[i] << endl;
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint gItr = 0;
+	bool finished;
+	bool *d_finished;
+	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
+	while (subgraph.numActiveNodes>0)
+	{
+		gItr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			uint itr = 0;
+			do
+			{
+				itr++;
+				finished = true;
+				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
+				
+				sssp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
+				
+				cudaDeviceSynchronize();
+				gpuErrorcheck( cudaPeekAtLastError() );
+				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
+			}while(!(finished));
+			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	//for(int i=0; i<20; i++)
+	//	cout << graph.value[i] << endl;
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
diff --git a/subway/sssp-sync.cu b/subway/sssp-sync.cu
index e4a7440..892a963 100644
--- a/subway/sssp-sync.cu
+++ b/subway/sssp-sync.cu
@@ -12,92 +12,75 @@ int main(int argc, char** argv)
 {
-	cudaFree(0);
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = DIST_INFINITY;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = 0;
-	graph.label1[arguments.sourceNode] = true;
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = DIST_INFINITY;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = 0;
+	graph.label1[arguments.sourceNode] = true;
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint itr = 0;
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		itr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			sssp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
-					//d_finished,
-					graph.d_label1, graph.d_label2);
-			cudaDeviceSynchronize();
-			gpuErrorcheck( cudaPeekAtLastError() );
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime << " (ms).\n";
-	
-	cout << "Number of iterations = " << itr << endl;
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint itr = 0;
+	while (subgraph.numActiveNodes>0)
+	{
+		itr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			sssp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
+					//d_finished,
+					graph.d_label1, graph.d_label2);
+			cudaDeviceSynchronize();
+			gpuErrorcheck( cudaPeekAtLastError() );
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime << " (ms).\n";
+	cout << "Number of iterations = " << itr << endl;
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
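Both sssp variants relax edges Bellman–Ford style: an active vertex u with distance `value[u]` offers `value[u] + w8` to each out-neighbour, and a neighbour that improves becomes active for the next round. The scalar core of that update on a two-edge path; in the real kernel the write would need an atomic min, which is omitted here:

```
#include <cstdio>

int main()
{
	const unsigned int DIST_INFINITY = 0xFFFFFFFF;
	unsigned int value[3] = {0, DIST_INFINITY, DIST_INFINITY};  // source is vertex 0
	int src[] = {0, 1}, dst[] = {1, 2};
	unsigned int w8[] = {5, 2};                                 // edge weights
	for (int round = 0; round < 2; round++)
		for (int e = 0; e < 2; e++)
			if (value[src[e]] != DIST_INFINITY && value[src[e]] + w8[e] < value[dst[e]])
				value[dst[e]] = value[src[e]] + w8[e];
	printf("%u %u %u\n", value[0], value[1], value[2]);         // 0 5 7
	return 0;
}
```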
diff --git a/subway/sswp-async.cu b/subway/sswp-async.cu
index 9b8e230..6606d93 100644
--- a/subway/sswp-async.cu
+++ b/subway/sswp-async.cu
@@ -12,116 +12,92 @@ int main(int argc, char** argv)
 {
-	cudaFree(0);
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	//for(unsigned int i=0; i<100; i++)
-	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = 0;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = DIST_INFINITY;
-	graph.label1[arguments.sourceNode] = true;
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	//for(unsigned int i=0; i<100; i++)
+	//	cout << graph.edgeList[i].end << " " << graph.edgeList[i].w8;
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = 0;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = DIST_INFINITY;
+	graph.label1[arguments.sourceNode] = true;
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint gItr = 0;
-	
-	bool finished;
-	bool *d_finished;
-	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		gItr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			
-			uint itr = 0;
-			do
-			{
-				cout << "\t\tIteration " << ++itr << endl;
-				finished = true;
-				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
-				
-				sswp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
-				
-				cudaDeviceSynchronize();
-				gpuErrorcheck( cudaPeekAtLastError() );
-				
-				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
-			}while(!(finished));
-			
-			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint gItr = 0;
+	bool finished;
+	bool *d_finished;
+	gpuErrorcheck(cudaMalloc(&d_finished, sizeof(bool)));
+	while (subgraph.numActiveNodes>0)
+	{
+		gItr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			//moveUpLabels<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(subgraph.d_activeNodes, graph.d_label, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			mixLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			uint itr = 0;
+			do
+			{
+				cout << "\t\tIteration " << ++itr << endl;
+				finished = true;
+				gpuErrorcheck(cudaMemcpy(d_finished, &finished, sizeof(bool), cudaMemcpyHostToDevice));
+				
+				sswp_async<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value, d_finished, (itr%2==1) ? graph.d_label1 : graph.d_label2, (itr%2==1) ? graph.d_label2 : graph.d_label1);
+				
+				cudaDeviceSynchronize();
+				gpuErrorcheck( cudaPeekAtLastError() );
+				gpuErrorcheck(cudaMemcpy(&finished, d_finished, sizeof(bool), cudaMemcpyDeviceToHost));
+			}while(!(finished));
+			cout << itr << ((itr>1) ? " Inner Iterations" : " Inner Iteration") << " in Global Iteration " << gItr << ", Partition " << i << endl;
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
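SSWP (single-source widest path) reuses the same machinery with the opposite extremes: the source is seeded to DIST_INFINITY, everything else to 0, and relaxing an edge takes the larger of the current bottleneck and `min(value[u], w8)`. In scalar form, under the same seeding assumption:

```
#include <algorithm>
#include <cstdio>

int main()
{
	const unsigned int DIST_INFINITY = 0xFFFFFFFF;
	unsigned int value[3] = {DIST_INFINITY, 0, 0};   // source bottleneck is unbounded
	int src[] = {0, 1}, dst[] = {1, 2};
	unsigned int w8[] = {7, 3};
	for (int round = 0; round < 2; round++)
		for (int e = 0; e < 2; e++)
			value[dst[e]] = std::max(value[dst[e]], std::min(value[src[e]], w8[e]));
	printf("%u %u %u\n", value[0], value[1], value[2]);  // bottlenecks: inf 7 3
	return 0;
}
```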
diff --git a/subway/sswp-sync.cu b/subway/sswp-sync.cu
index 32cea62..378ad72 100644
--- a/subway/sswp-sync.cu
+++ b/subway/sswp-sync.cu
@@ -11,93 +11,75 @@ int main(int argc, char** argv)
-{
-	cudaFree(0);
+{
+	cudaFree(0);
 
-	ArgumentParser arguments(argc, argv, true, false);
-	
-	Timer timer;
-	timer.Start();
-	
-	Graph<OutEdgeWeighted> graph(arguments.input, true);
-	graph.ReadGraph();
-	
-	float readtime = timer.Finish();
-	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
-	
-	for(unsigned int i=0; i<graph.num_nodes; i++)
-	{
-		graph.value[i] = 0;
-		graph.label1[i] = false;
-		graph.label2[i] = false;
-	}
-	graph.value[arguments.sourceNode] = DIST_INFINITY;
-	graph.label1[arguments.sourceNode] = true;
-	
-	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, graph.num_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, graph.num_nodes * sizeof(bool), cudaMemcpyHostToDevice));
-	
-	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
-	
-	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
-	
-	subgen.generate(graph, subgraph);
+	ArgumentParser arguments(argc, argv, true, false);
+	Timer timer;
+	timer.Start();
+	Graph<OutEdgeWeighted> graph(arguments.input, true);
+	graph.ReadGraph();
+	float readtime = timer.Finish();
+	cout << "Graph Reading finished in " << readtime/1000 << " (s).\n";
+	for(uint i=0; i<graph.num_nodes; i++)
+	{
+		graph.value[i] = 0;
+		graph.label1[i] = false;
+		graph.label2[i] = false;
+	}
+	graph.value[arguments.sourceNode] = DIST_INFINITY;
+	graph.label1[arguments.sourceNode] = true;
+	
+	ull n = graph.num_nodes;
+	gpuErrorcheck(cudaMemcpy(graph.d_outDegree, graph.outDegree, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_value, graph.value, n * sizeof(uint), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label1, graph.label1, n * sizeof(bool), cudaMemcpyHostToDevice));
+	gpuErrorcheck(cudaMemcpy(graph.d_label2, graph.label2, n * sizeof(bool), cudaMemcpyHostToDevice));
+	Subgraph<OutEdgeWeighted> subgraph(graph.num_nodes, graph.num_edges);
+	SubgraphGenerator<OutEdgeWeighted> subgen(graph);
+	subgen.generate(graph, subgraph);
 
-	Partitioner<OutEdgeWeighted> partitioner;
-	
-	timer.Start();
-	
-	uint itr = 0;
-	
-	while (subgraph.numActiveNodes>0)
-	{
-		itr++;
-		
-		partitioner.partition(subgraph, subgraph.numActiveNodes);
-		// a super iteration
-		for(int i=0; i<partitioner.numPartitions; i++)
-		{
-			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
-			sswp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
-					//d_finished,
-					graph.d_label1, graph.d_label2);
-			cudaDeviceSynchronize();
-			gpuErrorcheck( cudaPeekAtLastError() );
-		}
-		
-		subgen.generate(graph, subgraph);
-		
-	}
-	
-	float runtime = timer.Finish();
-	cout << "Processing finished in " << runtime/1000 << " (s).\n";
-	
-	cout << "Number of iterations = " << itr << endl;
-	
-	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, graph.num_nodes*sizeof(uint), cudaMemcpyDeviceToHost));
-	
-	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
-	
-	if(arguments.hasOutput)
-		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
+	Partitioner<OutEdgeWeighted> partitioner;
+	timer.Start();
+	uint itr = 0;
+	while (subgraph.numActiveNodes>0)
+	{
+		itr++;
+		partitioner.partition(subgraph, subgraph.numActiveNodes);
+		// a super iteration
+		for(int i=0; i<partitioner.numPartitions; i++)
+		{
+			moveUpLabels<<<partitioner.partitionNodeSize[i]/512 + 1, 512>>>(subgraph.d_activeNodes, graph.d_label1, graph.d_label2, partitioner.partitionNodeSize[i], partitioner.fromNode[i]);
+			sswp_kernel<<< partitioner.partitionNodeSize[i]/512 + 1 , 512 >>>(partitioner.partitionNodeSize[i], partitioner.fromNode[i], partitioner.fromEdge[i], subgraph.d_activeNodes, subgraph.d_activeNodesPointer, subgraph.d_activeEdgeList, graph.d_outDegree, graph.d_value,
+					//d_finished,
+					graph.d_label1, graph.d_label2);
+			cudaDeviceSynchronize();
+			gpuErrorcheck( cudaPeekAtLastError() );
+		}
+		subgen.generate(graph, subgraph);
+	}
+	float runtime = timer.Finish();
+	cout << "Processing finished in " << runtime/1000 << " (s).\n";
+	cout << "Number of iterations = " << itr << endl;
+	gpuErrorcheck(cudaMemcpy(graph.value, graph.d_value, n*sizeof(uint), cudaMemcpyDeviceToHost));
+	utilities::PrintResults(graph.value, min(30, graph.num_nodes));
+	if(arguments.hasOutput)
+		utilities::SaveResults(arguments.output, graph.value, graph.num_nodes);
 }
diff --git a/tools/Makefile b/tools/Makefile
index 5a06f9e..24ea489 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,15 +1,16 @@
 CC=g++
 NC=nvcc
-CFLAGS=-std=c++11 -O3
-NFLAGS=-arch=sm_60
+CFLAGS=-std=c++14 -O3
+
+#NFLAGS=-arch=sm_80
 
 SHARED=../shared
 
-all: converter
+all: converter_stdin
 
-converter: converter.cpp
-	$(CC) converter.cpp -o converter $(CFLAGS)
+converter_stdin: converter_stdin.cpp
+	$(CC) converter_stdin.cpp -o converter_stdin $(CFLAGS)
 
 clean:
-	rm -f converter
+	rm -f converter_stdin
diff --git a/tools/converter.cpp b/tools/converter.cpp
deleted file mode 100644
index 68365ac..0000000
--- a/tools/converter.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include "../shared/globals.hpp"
-
-
-bool IsWeightedFormat(string format)
-{
-	if((format == "bwcsr") ||
-		(format == "wcsr") ||
-		(format == "wel"))
-		return true;
-	return false;
-}
-
-string GetFileExtension(string fileName)
-{
-	if(fileName.find_last_of(".") != string::npos)
-		return fileName.substr(fileName.find_last_of(".")+1);
-	return "";
-}
-
-int main(int argc, char** argv)
-{
-	if(argc!= 2)
-	{
-		cout << "\nThere was an error parsing command line arguments\n";
-		exit(0);
-	}
-
-	string input = string(argv[1]);
-
-	if(GetFileExtension(input) == "el")
-	{
-		ifstream infile;
-		infile.open(input);
-		stringstream ss;
-		uint max = 0;
-		string line;
-		uint edgeCounter = 0;
-
-		vector<Edge> edges;
-		Edge newEdge;
-		while(getline( infile, line ))
-		{
-			ss.str("");
-			ss.clear();
-			ss << line;
-
-			ss >> newEdge.source;
-			ss >> newEdge.end;
-
-			edges.push_back(newEdge);
-			edgeCounter++;
-
-			if(max < newEdge.source)
-				max = newEdge.source;
-			if(max < newEdge.end)
-				max = newEdge.end;
-		}
-		infile.close();
-
-		uint num_nodes = max + 1;
-		uint num_edges = edgeCounter;
-		uint *nodePointer = new uint[num_nodes+1];
-		OutEdge *edgeList = new OutEdge[num_edges];
-		uint *degree = new uint[num_nodes];
-		for(uint i=0; i<num_nodes; i++)
-			degree[i] = 0;
-		for(uint i=0; i<num_edges; i++)
-			degree[edges[i].source]++;
-
-		uint counter = 0;
-		for(uint i=0; i<num_nodes; i++)
-		{
-			nodePointer[i] = counter;
-			counter = counter + degree[i];
-		}
-		nodePointer[num_nodes] = num_edges;
-		uint *outDegreeCounter = new uint[num_nodes];
-		for(uint i=0; i<num_edges; i++)
-		{
-			uint location = nodePointer[edges[i].source] + outDegreeCounter[edges[i].source];
-			edgeList[location].end = edges[i].end;
-			outDegreeCounter[edges[i].source]++;
-		}
-
-		ofstream outfile(input.substr(0, input.length()-2)+"bcsr", ofstream::binary);
-		outfile.write((char*)&num_nodes, sizeof(uint));
-		outfile.write((char*)&num_edges, sizeof(uint));
-		outfile.write((char*)nodePointer, sizeof(uint)*num_nodes);
-		outfile.write((char*)edgeList, sizeof(OutEdge)*num_edges);
-		outfile.close();
-
-		delete[] nodePointer;
-		delete[] edgeList;
-		delete[] degree;
-	}
-	else if(GetFileExtension(input) == "wel")
-	{
-		ifstream infile;
-		infile.open(input);
-		stringstream ss;
-		uint max = 0;
-		string line;
-		uint edgeCounter = 0;
-
-		vector<EdgeWeighted> edges;
-		EdgeWeighted newEdge;
-		while(getline( infile, line ))
-		{
-			ss.str("");
-			ss.clear();
-			ss << line;
-
-			ss >> newEdge.source;
-			ss >> newEdge.end;
-			ss >> newEdge.w8;
-
-			edges.push_back(newEdge);
-			edgeCounter++;
-
-			if(max < newEdge.source)
-				max = newEdge.source;
-			if(max < newEdge.end)
-				max = newEdge.end;
-		}
-		infile.close();
-
-		uint num_nodes = max + 1;
-		uint num_edges = edgeCounter;
-		uint *nodePointer = new uint[num_nodes+1];
-		OutEdgeWeighted *edgeList = new OutEdgeWeighted[num_edges];
-		uint *degree = new uint[num_nodes];
-		for(uint i=0; i<num_nodes; i++)
-			degree[i] = 0;
-		for(uint i=0; i<num_edges; i++)
-			degree[edges[i].source]++;
-
-		uint counter = 0;
-		for(uint i=0; i<num_nodes; i++)
-		{
-			nodePointer[i] = counter;
-			counter = counter + degree[i];
-		}
-		nodePointer[num_nodes] = num_edges;
-		uint *outDegreeCounter = new uint[num_nodes];
-		for(uint i=0; i<num_edges; i++)
-		{
-			uint location = nodePointer[edges[i].source] + outDegreeCounter[edges[i].source];
-			edgeList[location].end = edges[i].end;
-			edgeList[location].w8 = edges[i].w8;
-			outDegreeCounter[edges[i].source]++;
-		}
-
-		ofstream outfile(input.substr(0, input.length()-3)+"bwcsr", ofstream::binary);
-		outfile.write((char*)&num_nodes, sizeof(uint));
-		outfile.write((char*)&num_edges, sizeof(uint));
-		outfile.write((char*)nodePointer, sizeof(uint)*num_nodes);
-		outfile.write((char*)edgeList, sizeof(OutEdgeWeighted)*num_edges);
-		outfile.close();
-
-		delete[] nodePointer;
-		delete[] edgeList;
-		delete[] degree;
-	}
-	else
-	{
-		cout << "\nInput file format is not supported.\n";
-		exit(0);
-	}
-
-	return 0;
-}
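The replacement converter (next file) streams edges from stdin and splits each line on a TAB with `std::string::find` and `std::stoul`, instead of stringstream extraction from an opened file. The same splitting technique isolated on one hypothetical three-field line:

```
#include <cstdio>
#include <string>

int main()
{
	std::string line = "3\t7\t42", delim = "\t";
	size_t start = 0, end = line.find(delim);
	unsigned int source = std::stoul(line.substr(start, end - start));
	start = end + delim.length(); end = line.find(delim, start);
	unsigned int dest = std::stoul(line.substr(start, end - start));
	start = end + delim.length();
	unsigned int w8 = std::stoul(line.substr(start));   // weight field
	printf("%u %u %u\n", source, dest, w8);
	return 0;
}
```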
diff --git a/tools/converter_stdin.cpp b/tools/converter_stdin.cpp
new file mode 100644
--- /dev/null
+++ b/tools/converter_stdin.cpp
+#include "../shared/globals.hpp"
+#include <iostream>
+#include <string>
+
+bool IsWeightedFormat(string format)
+{
+	if((format == "bwcsr") ||
+		(format == "wcsr") ||
+		(format == "wel"))
+		return true;
+	return false;
+}
+
+string GetFileExtension(string fileName)
+{
+	if(fileName.find_last_of(".") != string::npos)
+		return fileName.substr(fileName.find_last_of(".")+1);
+	return "";
+}
+
+void save_edge_data_to_csr(const std::string& output_filename)
+{
+	uint max = 0;
+	ull edgeCounter = 0;
+	vector<Edge> edges;
+	Edge newEdge;
+
+	std::string delim = "\t";
+	for (std::string line; std::getline(std::cin, line);)
+	{
+		auto start = 0;
+		auto end = line.find(delim);
+		newEdge.source = static_cast<uint>(std::stoul(line.substr(start, end - start)));
+		start = end + delim.length();
+		newEdge.end = static_cast<uint>(std::stoul(line.substr(start)));
+		edges.push_back(newEdge);
+		edgeCounter++;
+		if(max < newEdge.source)
+			max = newEdge.source;
+		if(max < newEdge.end)
+			max = newEdge.end;
+	}
+	uint num_nodes = max + 1;
+	ull num_edges = edgeCounter;
+	ull *nodePointer = new ull[num_nodes+1];
+	OutEdge *edgeList = new OutEdge[num_edges];
+	//out degree
+	uint *degree = new uint[num_nodes];
+	for(uint i=0; i<num_nodes; i++)
+		degree[i] = 0;
+	for(ull i=0; i<num_edges; i++)
+		degree[edges[i].source]++;
+
+	ull counter = 0;
+	for(uint i=0; i<num_nodes; i++)
+	{
+		nodePointer[i] = counter;
+		counter = counter + degree[i];
+	}
+	nodePointer[num_nodes] = num_edges;
+	uint *outDegreeCounter = new uint[num_nodes];
+	for(ull i=0; i<num_edges; i++)
+	{
+		ull location = nodePointer[edges[i].source] + outDegreeCounter[edges[i].source];
+		edgeList[location].end = edges[i].end;
+		outDegreeCounter[edges[i].source]++;
+	}
+
+	ofstream outfile(output_filename, ofstream::binary);
+	outfile.write((char*)&num_nodes, sizeof(uint));
+	outfile.write((char*)&num_edges, sizeof(ull));
+	outfile.write((char*)nodePointer, sizeof(ull)*(num_nodes+1));
+	outfile.write((char*)edgeList, sizeof(OutEdge)*num_edges);
+	outfile.close();
+
+	delete[] nodePointer;
+	delete[] edgeList;
+	delete[] degree;
+}
+
+void parseLine(const std::string& line, EdgeWeighted& edge, const std::string& delimiter)
+{
+	auto start = 0;
+	auto end = line.find(delimiter);
+	edge.source = static_cast<uint>(std::stoul(line.substr(start, end - start)));
+	start = end + delimiter.length();
+	end = line.find(delimiter, start);
+	edge.end = static_cast<uint>(std::stoul(line.substr(start, end - start)));
+	start = end + delimiter.length();
+	edge.w8 = static_cast<uint>(std::stoul(line.substr(start)));
+}
+
+void save_weighted_edge_data_to_csr(const std::string& output_filename)
+{
+	uint max = 0;
+	string line;
+	ull edgeCounter = 0;
+
+	vector<EdgeWeighted> edges;
+	EdgeWeighted newEdge;
+	std::string delim = "\t";
+	for (std::string line; std::getline(std::cin, line);)
+	{
+		parseLine(line, newEdge, delim);
+		edges.push_back(newEdge);
+		edgeCounter++;
+		if(max < newEdge.source)
+			max = newEdge.source;
+		if(max < newEdge.end)
+			max = newEdge.end;
+	}
+	uint num_nodes = max + 1;
+	ull num_edges = edgeCounter;
+	ull *nodePointer = new ull[num_nodes+1];
+	OutEdgeWeighted *edgeList = new OutEdgeWeighted[num_edges];
+	uint *degree = new uint[num_nodes];
+	for(uint i=0; i