From aa06920e544df81186c8652b2cd9067f5542cff3 Mon Sep 17 00:00:00 2001 From: Ahmed Kadhim Date: Mon, 4 Nov 2024 10:55:03 +0000 Subject: [PATCH 01/22] add recom sys --- .gitignore | 3 + .../Applications/RecommendationSystems.py | 60 +++++++++++++++++++ examples/MNISTConvolutionDemo.py | 13 ---- examples/MNISTVanillaDemo.py | 15 ----- examples/NoisyXORDemo.py | 21 +------ examples/SequenceClassificationDemo.py | 14 ----- 6 files changed, 65 insertions(+), 61 deletions(-) create mode 100644 .gitignore create mode 100644 examples/Applications/RecommendationSystems.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b4f6c42 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +build/ +GraphTsetlinMachine.egg-info/ +/dist/ \ No newline at end of file diff --git a/examples/Applications/RecommendationSystems.py b/examples/Applications/RecommendationSystems.py new file mode 100644 index 0000000..56b73b2 --- /dev/null +++ b/examples/Applications/RecommendationSystems.py @@ -0,0 +1,60 @@ +from GraphTsetlinMachine.graphs import Graphs +import numpy as np +from scipy.sparse import csr_matrix +from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine +from time import time +import argparse +import random +import pandas as pd +import kagglehub + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--number-of-clauses", default=10, type=int) + parser.add_argument("--T", default=100, type=int) + parser.add_argument("--s", default=1.0, type=float) + parser.add_argument("--number-of-state-bits", default=8, type=int) + parser.add_argument("--depth", default=2, type=int) + parser.add_argument("--hypervector-size", default=32, type=int) + parser.add_argument("--hypervector-bits", default=2, type=int) + parser.add_argument("--message-size", default=256, type=int) + parser.add_argument("--message-bits", default=2, type=int) + parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') + parser.add_argument("--noise", default=0.01, type=float) + parser.add_argument("--number-of-examples", default=10000, type=int) + parser.add_argument("--max-included-literals", default=4, type=int) + + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args + +args = default_args() + +print("Creating training data") +path = kagglehub.dataset_download("arhamrumi/amazon-product-reviews") +print("Path to dataset files:", path) +data_file = path + "/Reviews.csv" # Adjust this path if necessary +data = pd.read_csv(data_file) +print("Data preview:", data.head()) + +number_of_nodes = 3 + +symbols = [] +users = data['user_id'].unique() +items = data['product_id'].unique() +categories = data['category'].unique() + +# Initialize Graphs with symbols for GTM +num_graphs = len(items) +symbols = ["I" + str(i) for i in items] + ["C" + str(c) for c in categories] + ["U" + str(u) for u in users] + +graphs_train = Graphs( + X_train.shape[0], + symbols=symbols, + hypervector_size=args.hypervector_size, + hypervector_bits=args.hypervector_bits, + double_hashing = args.double_hashing +) \ No newline at end of file diff --git a/examples/MNISTConvolutionDemo.py b/examples/MNISTConvolutionDemo.py index 8fe7547..a9ee583 100644 --- a/examples/MNISTConvolutionDemo.py +++ b/examples/MNISTConvolutionDemo.py @@ -61,18 +61,13 @@ def default_args(**kwargs): hypervector_bits=args.hypervector_bits, double_hashing = args.double_hashing ) 
- for graph_id in range(X_train.shape[0]): graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_train.prepare_node_configuration() - for graph_id in range(X_train.shape[0]): for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): graphs_train.add_graph_node(graph_id, node_id, 0) - graphs_train.prepare_edge_configuration() - for graph_id in range(X_train.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_train.shape[0]) @@ -88,23 +83,17 @@ def default_args(**kwargs): graphs_train.add_graph_node_property(graph_id, node_id, "C:%d" % (q)) graphs_train.add_graph_node_property(graph_id, node_id, "R:%d" % (r)) - graphs_train.encode() - print("Training data produced") graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) for graph_id in range(X_test.shape[0]): graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_test.prepare_node_configuration() - for graph_id in range(X_test.shape[0]): for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): graphs_test.add_graph_node(graph_id, node_id, 0) - graphs_test.prepare_edge_configuration() - for graph_id in range(X_test.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_test.shape[0]) @@ -120,9 +109,7 @@ def default_args(**kwargs): graphs_test.add_graph_node_property(graph_id, node_id, "C:%d" % (q)) graphs_test.add_graph_node_property(graph_id, node_id, "R:%d" % (r)) - graphs_test.encode() - print("Testing data produced") tm = MultiClassGraphTsetlinMachine( diff --git a/examples/MNISTVanillaDemo.py b/examples/MNISTVanillaDemo.py index 8bcb453..02b95e2 100644 --- a/examples/MNISTVanillaDemo.py +++ b/examples/MNISTVanillaDemo.py @@ -4,7 +4,6 @@ from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine from time import time import argparse -from skimage.util import view_as_windows from keras.datasets import mnist from numba import jit @@ -53,51 +52,37 @@ def default_args(**kwargs): hypervector_bits=args.hypervector_bits, double_hashing = args.double_hashing ) - for graph_id in range(X_train.shape[0]): graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_train.prepare_node_configuration() - for graph_id in range(X_train.shape[0]): number_of_outgoing_edges = 0 graphs_train.add_graph_node(graph_id, 'Image Node', number_of_outgoing_edges) - graphs_train.prepare_edge_configuration() - for graph_id in range(X_train.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_train.shape[0]) for k in X_train[graph_id].nonzero()[0]: graphs_train.add_graph_node_property(graph_id, 'Image Node', "W%d,%d" % (k // 28, k % 28)) - graphs_train.encode() - print("Training data produced") graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) - for graph_id in range(X_test.shape[0]): graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_test.prepare_node_configuration() - for graph_id in range(X_test.shape[0]): number_of_outgoing_edges = 0 graphs_test.add_graph_node(graph_id, 'Image Node', number_of_outgoing_edges) - graphs_test.prepare_edge_configuration() - for graph_id in range(X_test.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_test.shape[0]) for k in X_test[graph_id].nonzero()[0]: graphs_test.add_graph_node_property(graph_id, 'Image Node', "W%d,%d" % (k // 28, k % 28)) - graphs_test.encode() - print("Testing data produced") tm = MultiClassGraphTsetlinMachine( diff --git a/examples/NoisyXORDemo.py b/examples/NoisyXORDemo.py index 83a4bbd..3069207 100644 --- a/examples/NoisyXORDemo.py +++ b/examples/NoisyXORDemo.py @@ -34,31 +34,24 @@ 
def default_args(**kwargs): print("Creating training data") # Create train data - graphs_train = Graphs( args.number_of_examples, symbols=['A', 'B'], hypervector_size=args.hypervector_size, hypervector_bits=args.hypervector_bits, ) - for graph_id in range(args.number_of_examples): graphs_train.set_number_of_graph_nodes(graph_id, 2) - graphs_train.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_train.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_train.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - -graphs_train.prepare_edge_configuration() - +graphs_train.prepare_edge_configuration() for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_train.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_train.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_train = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice(['A', 'B']) @@ -74,32 +67,23 @@ def default_args(**kwargs): if np.random.rand() <= args.noise: Y_train[graph_id] = 1 - Y_train[graph_id] - graphs_train.encode() -# Create test data - +# Create test data print("Creating testing data") - graphs_test = Graphs(args.number_of_examples, init_with=graphs_train) - for graph_id in range(args.number_of_examples): graphs_test.set_number_of_graph_nodes(graph_id, 2) - graphs_test.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_test.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_test.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - graphs_test.prepare_edge_configuration() - for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_test.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_test.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_test = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice(['A', 'B']) @@ -112,7 +96,6 @@ def default_args(**kwargs): Y_test[graph_id] = 0 else: Y_test[graph_id] = 1 - graphs_test.encode() tm = MultiClassGraphTsetlinMachine( diff --git a/examples/SequenceClassificationDemo.py b/examples/SequenceClassificationDemo.py index 7a2362c..c5b1321 100644 --- a/examples/SequenceClassificationDemo.py +++ b/examples/SequenceClassificationDemo.py @@ -35,7 +35,6 @@ def default_args(**kwargs): print("Creating training data") # Create train data - graphs_train = Graphs( args.number_of_examples, symbols=['A'], @@ -43,19 +42,14 @@ def default_args(**kwargs): hypervector_bits=args.hypervector_bits, double_hashing = args.double_hashing ) - for graph_id in range(args.number_of_examples): graphs_train.set_number_of_graph_nodes(graph_id, np.random.randint(args.number_of_classes, args.max_sequence_length+1)) - graphs_train.prepare_node_configuration() - for graph_id in range(args.number_of_examples): for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 graphs_train.add_graph_node(graph_id, node_id, number_of_edges) - graphs_train.prepare_edge_configuration() - Y_train = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): @@ -76,26 +70,19 @@ def default_args(**kwargs): if
np.random.rand() <= args.noise: Y_train[graph_id] = np.random.choice(np.setdiff1d(np.arange(args.number_of_classes), [Y_train[graph_id]])) - graphs_train.encode() # Create test data - print("Creating testing data") - graphs_test = Graphs(args.number_of_examples, init_with=graphs_train) for graph_id in range(args.number_of_examples): graphs_test.set_number_of_graph_nodes(graph_id, np.random.randint(args.number_of_classes, args.max_sequence_length+1)) - graphs_test.prepare_node_configuration() - for graph_id in range(args.number_of_examples): for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 graphs_test.add_graph_node(graph_id, node_id, number_of_edges) - graphs_test.prepare_edge_configuration() - Y_test = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): @@ -113,7 +100,6 @@ def default_args(**kwargs): node_id = np.random.randint(Y_test[graph_id], graphs_test.number_of_graph_nodes[graph_id]) for node_pos in range(Y_test[graph_id] + 1): graphs_test.add_graph_node_property(graph_id, node_id - node_pos, 'A') - graphs_test.encode() tm = MultiClassGraphTsetlinMachine( From 6280bfbc95ab1f3a2ce80ccde68b16851293dbb6 Mon Sep 17 00:00:00 2001 From: Ahmed Kadhim Date: Mon, 4 Nov 2024 10:58:20 +0000 Subject: [PATCH 02/22] rename --- examples/{Applications => applications}/RecommendationSystems.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{Applications => applications}/RecommendationSystems.py (100%) diff --git a/examples/Applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py similarity index 100% rename from examples/Applications/RecommendationSystems.py rename to examples/applications/RecommendationSystems.py From 771edcf2af1bd1b4bb97606adcddbda472cdae0c Mon Sep 17 00:00:00 2001 From: Ahmed Kadhim Date: Wed, 6 Nov 2024 08:49:04 +0000 Subject: [PATCH 03/22] complete recom sys --- examples/MNISTVanillaDemo.py | 31 ++-- examples/NoisyXORMNISTDemo.py | 16 -- .../applications/RecommendationSystems.py | 165 ++++++++++++++++-- examples/applications/test.ipynb | 101 +++++++++++ 4 files changed, 264 insertions(+), 49 deletions(-) create mode 100644 examples/applications/test.ipynb diff --git a/examples/MNISTVanillaDemo.py b/examples/MNISTVanillaDemo.py index 02b95e2..4428343 100644 --- a/examples/MNISTVanillaDemo.py +++ b/examples/MNISTVanillaDemo.py @@ -60,9 +60,8 @@ def default_args(**kwargs): graphs_train.add_graph_node(graph_id, 'Image Node', number_of_outgoing_edges) graphs_train.prepare_edge_configuration() for graph_id in range(X_train.shape[0]): - if graph_id % 1000 == 0: - print(graph_id, X_train.shape[0]) - + # if graph_id % 1000 == 0: + # print(graph_id, X_train.shape[0]) for k in X_train[graph_id].nonzero()[0]: graphs_train.add_graph_node_property(graph_id, 'Image Node', "W%d,%d" % (k // 28, k % 28)) graphs_train.encode() @@ -110,16 +109,16 @@ def default_args(**kwargs): print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) -weights = tm.get_state()[1].reshape(2, -1) -for i in range(tm.number_of_clauses): - print("Clause #%d Weights:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') - l = [] - for k in range(args.hypervector_size * 2): - if tm.ta_action(0, i, k): - if k < args.hypervector_size: - l.append("x%d" % (k)) - else: - l.append("NOT 
x%d" % (k - args.hypervector_size)) - print(" AND ".join(l)) - -print(graphs_train.hypervectors) \ No newline at end of file +# weights = tm.get_state()[1].reshape(2, -1) +# for i in range(tm.number_of_clauses): +# print("Clause #%d Weights:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') +# l = [] +# for k in range(args.hypervector_size * 2): +# if tm.ta_action(0, i, k): +# if k < args.hypervector_size: +# l.append("x%d" % (k)) +# else: +# l.append("NOT x%d" % (k - args.hypervector_size)) +# print(" AND ".join(l)) + +# print(graphs_train.hypervectors) \ No newline at end of file diff --git a/examples/NoisyXORMNISTDemo.py b/examples/NoisyXORMNISTDemo.py index ff1b315..5da4787 100644 --- a/examples/NoisyXORMNISTDemo.py +++ b/examples/NoisyXORMNISTDemo.py @@ -54,24 +54,18 @@ def default_args(**kwargs): hypervector_size=args.hypervector_size, hypervector_bits=args.hypervector_bits, ) - for graph_id in range(args.number_of_examples): graphs_train.set_number_of_graph_nodes(graph_id, 2) - graphs_train.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_train.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_train.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - graphs_train.prepare_edge_configuration() - for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_train.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_train.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_train = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice([0, 1]) @@ -91,32 +85,23 @@ def default_args(**kwargs): if np.random.rand() <= args.noise: Y_train[graph_id] = 1 - Y_train[graph_id] - graphs_train.encode() # Create test data - print("Creating testing data") - graphs_test = Graphs(args.number_of_examples, init_with=graphs_train) - for graph_id in range(args.number_of_examples): graphs_test.set_number_of_graph_nodes(graph_id, 2) - graphs_test.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_test.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_test.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - graphs_test.prepare_edge_configuration() - for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_test.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_test.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_test = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice([0, 1]) @@ -133,7 +118,6 @@ def default_args(**kwargs): Y_test[graph_id] = 0 else: Y_test[graph_id] = 1 - graphs_test.encode() tm = MultiClassGraphTsetlinMachine( diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index 56b73b2..8901911 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -1,25 +1,24 @@ from GraphTsetlinMachine.graphs import Graphs -import numpy as np -from scipy.sparse import csr_matrix from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine from time import time import argparse -import random import pandas as pd import kagglehub +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder def default_args(**kwargs): parser = argparse.ArgumentParser() - 
parser.add_argument("--epochs", default=10, type=int) - parser.add_argument("--number-of-clauses", default=10, type=int) + parser.add_argument("--epochs", default=250, type=int) + parser.add_argument("--number-of-clauses", default=60, type=int) parser.add_argument("--T", default=100, type=int) - parser.add_argument("--s", default=1.0, type=float) + parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) parser.add_argument("--depth", default=2, type=int) - parser.add_argument("--hypervector-size", default=32, type=int) - parser.add_argument("--hypervector-bits", default=2, type=int) + parser.add_argument("--hypervector-size", default=1024, type=int) + parser.add_argument("--hypervector-bits", default=8, type=int) parser.add_argument("--message-size", default=256, type=int) - parser.add_argument("--message-bits", default=2, type=int) + parser.add_argument("--message-bits", default=8, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) parser.add_argument("--number-of-examples", default=10000, type=int) @@ -34,27 +33,159 @@ def default_args(**kwargs): args = default_args() print("Creating training data") -path = kagglehub.dataset_download("arhamrumi/amazon-product-reviews") +path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") print("Path to dataset files:", path) -data_file = path + "/Reviews.csv" # Adjust this path if necessary +data_file = path + "/amazon.csv" data = pd.read_csv(data_file) -print("Data preview:", data.head()) +# print("Data preview:", data.head()) +data = data[['product_id', 'category', 'user_id', 'rating']] + +le_user = LabelEncoder() +le_item = LabelEncoder() +le_category = LabelEncoder() +le_rating = LabelEncoder() -number_of_nodes = 3 +data['user_id'] = le_user.fit_transform(data['user_id']) +data['product_id'] = le_item.fit_transform(data['product_id']) +data['category'] = le_category.fit_transform(data['category']) +data['rating'] = le_rating.fit_transform(data['rating']) + +x = data[['user_id', 'product_id', 'category']].values +y = data['rating'].values + +X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42) + +print("X_train shape:", X_train.shape) +print("y_train shape:", Y_train.shape) +print("X_test shape:", X_test.shape) +print("y_test shape:", Y_test.shape) -symbols = [] users = data['user_id'].unique() items = data['product_id'].unique() categories = data['category'].unique() # Initialize Graphs with symbols for GTM -num_graphs = len(items) -symbols = ["I" + str(i) for i in items] + ["C" + str(c) for c in categories] + ["U" + str(u) for u in users] +number_of_nodes = 3 +symbols = [] +symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] +# Train data graphs_train = Graphs( X_train.shape[0], symbols=symbols, hypervector_size=args.hypervector_size, hypervector_bits=args.hypervector_bits, double_hashing = args.double_hashing -) \ No newline at end of file +) +for graph_id in range(X_train.shape[0]): + graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) +graphs_train.prepare_node_configuration() +for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + 
graphs_train.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_train.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_train.add_graph_node(graph_id, "Category", number_of_edges) +graphs_train.prepare_edge_configuration() +for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_train.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_train.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_train.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_train.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") + + graphs_train.add_graph_node_property(graph_id, "User", "U_" + str(X_train[graph_id][0])) + graphs_train.add_graph_node_property(graph_id, "Item", "I_" + str(X_train[graph_id][1])) + graphs_train.add_graph_node_property(graph_id, "Category", "C_" + str(X_train[graph_id][2])) +graphs_train.encode() +print("Training data produced") + +# Test data +graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) +for graph_id in range(X_test.shape[0]): + graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) +graphs_test.prepare_node_configuration() +for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + graphs_test.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_test.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_test.add_graph_node(graph_id, "Category", number_of_edges) +graphs_test.prepare_edge_configuration() +for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_test.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_test.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_test.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_test.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") + + graphs_test.add_graph_node_property(graph_id, "User", "U_" + str(X_test[graph_id][0])) + graphs_test.add_graph_node_property(graph_id, "Item", "I_" + str(X_test[graph_id][1])) + graphs_test.add_graph_node_property(graph_id, "Category", "C_" + str(X_test[graph_id][2])) +graphs_test.encode() +print("Testing data produced") + +tm = MultiClassGraphTsetlinMachine( + args.number_of_clauses, + args.T, + args.s, + number_of_state_bits = args.number_of_state_bits, + depth=args.depth, + message_size=args.message_size, + message_bits=args.message_bits, + max_included_literals=args.max_included_literals, + double_hashing = args.double_hashing +) + +for i in range(args.epochs): + start_training = time() + tm.fit(graphs_train, Y_train, epochs=1, incremental=True) + stop_training = time() + + start_testing = time() + result_test = 100*(tm.predict(graphs_test) == Y_test).mean() + stop_testing = time() + + result_train = 100*(tm.predict(graphs_train) == Y_train).mean() + + print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) + +# weights = tm.get_state()[1].reshape(2, -1) +# for i in range(tm.number_of_clauses): +# print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') +# 
l = [] +# for k in range(args.hypervector_size * 2): +# if tm.ta_action(0, i, k): +# if k < args.hypervector_size: +# l.append("x%d" % (k)) +# else: +# l.append("NOT x%d" % (k - args.hypervector_size)) + +# for k in range(args.message_size * 2): +# if tm.ta_action(1, i, k): +# if k < args.message_size: +# l.append("c%d" % (k)) +# else: +# l.append("NOT c%d" % (k - args.message_size)) + +# print(" AND ".join(l)) + +# print(graphs_test.hypervectors) +# print(tm.hypervectors) +# print(graphs_test.edge_type_id) \ No newline at end of file diff --git a/examples/applications/test.ipynb b/examples/applications/test.ipynb new file mode 100644 index 0000000..44e0294 --- /dev/null +++ b/examples/applications/test.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating training data\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Electronics|HomeTheater,TV&Video|Accessories|RemoteControls\n", + "X_train shape: (1172, 3)\n", + "y_train shape: (1172,)\n", + "X_test shape: (293, 3)\n", + "y_test shape: (293,)\n", + "111\n", + "Electronics|HomeTheater,TV&Video|Accessories|RemoteControls\n" + ] + } + ], + "source": [ + "from GraphTsetlinMachine.graphs import Graphs\n", + "import numpy as np\n", + "from scipy.sparse import csr_matrix\n", + "from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine\n", + "from time import time\n", + "import argparse\n", + "import random\n", + "import pandas as pd\n", + "import kagglehub\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "\n", + "print(\"Creating training data\")\n", + "path = kagglehub.dataset_download(\"karkavelrajaj/amazon-sales-dataset\")\n", + "print(\"Path to dataset files:\", path)\n", + "data_file = path + \"/amazon.csv\" # Adjust this path if necessary\n", + "data = pd.read_csv(data_file)\n", + "# print(\"Data preview:\", data.head())\n", + "data = data[['product_id', 'category', 'user_id', 'rating']]\n", + "print(data['category'][100])\n", + " \n", + "# Step 2: Encode user_id, product_id, and category with LabelEncoder\n", + "# This converts string identifiers into unique integer values\n", + "le_user = LabelEncoder()\n", + "le_item = LabelEncoder()\n", + "le_category = LabelEncoder()\n", + "\n", + "data['user_id'] = le_user.fit_transform(data['user_id'])\n", + "data['product_id'] = le_item.fit_transform(data['product_id'])\n", + "data['category'] = le_category.fit_transform(data['category'])\n", + "\n", + "# Step 3: Prepare X (features) and y (labels)\n", + "x = data[['user_id', 'product_id', 'category']].values # Features: [user, item, category]\n", + "y = data['rating'].values # Labels: rating\n", + "\n", + "# Step 4: Split the data into training and test sets\n", + "X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)\n", + "\n", + "# Display the shapes to verify the split\n", + "print(\"X_train shape:\", X_train.shape)\n", + "print(\"y_train shape:\", Y_train.shape)\n", + "print(\"X_test shape:\", X_test.shape)\n", + "print(\"y_test shape:\", Y_test.shape)\n", + "\n", + "users = data['user_id'].unique()\n", + "items = data['product_id'].unique()\n", + "categories = data['category'].unique()\n", + "\n", + "print(categories[100])\n", + "original_user_id = le_category.inverse_transform([data['category'][100]])[0]\n", + 
"print(original_user_id)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ec3fc8725952d91e73fe17f0ad6a3628afa6ccd8 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 4 Dec 2024 11:29:05 +0000 Subject: [PATCH 04/22] rename --- .devcontainer/devcontainer.json | 4 ++-- .devcontainer/docker-compose.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e500cf2..b264ff4 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,7 +1,7 @@ { - "name": "TM Graph Devcontainer", + "name": "TM Graph Recomm", "dockerComposeFile": "docker-compose.yml", - "service": "tm-graph-development", + "service": "tm-graph-recomm", "workspaceFolder": "/app", "forwardPorts": [], "postCreateCommand": "echo 'Devcontainer is ready'", diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 183c3ac..0dccd18 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -1,5 +1,5 @@ services: - tm-graph-development: + tm-graph-recomm: build: context: ../ dockerfile: .devcontainer/Dockerfile From 08693ab145312d82fe5e99bb04bb82a2e9a35194 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 4 Dec 2024 13:22:41 +0000 Subject: [PATCH 05/22] tunning --- examples/applications/RecommendationSystems.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index 8901911..016b154 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -10,18 +10,18 @@ def default_args(**kwargs): parser = argparse.ArgumentParser() parser.add_argument("--epochs", default=250, type=int) - parser.add_argument("--number-of-clauses", default=60, type=int) - parser.add_argument("--T", default=100, type=int) + parser.add_argument("--number-of-clauses", default=1000, type=int) + parser.add_argument("--T", default=1000, type=int) parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) - parser.add_argument("--depth", default=2, type=int) - parser.add_argument("--hypervector-size", default=1024, type=int) - parser.add_argument("--hypervector-bits", default=8, type=int) - parser.add_argument("--message-size", default=256, type=int) - parser.add_argument("--message-bits", default=8, type=int) + parser.add_argument("--depth", default=3, type=int) + parser.add_argument("--hypervector-size", default=16384, type=int) + parser.add_argument("--hypervector-bits", default=328, type=int) + parser.add_argument("--message-size", default=1024, type=int) + parser.add_argument("--message-bits", default=32, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--number-of-examples", default=10000, type=int) + parser.add_argument("--number-of-examples", default=1000, type=int) parser.add_argument("--max-included-literals", default=4, 
type=int) args = parser.parse_args() @@ -68,7 +68,7 @@ def default_args(**kwargs): number_of_nodes = 3 symbols = [] symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] - +print(len(symbols)) # Train data graphs_train = Graphs( X_train.shape[0], From 9c4be1f888844ae37879a6fc97ff68561d4f62d2 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Mon, 16 Dec 2024 12:19:18 +0000 Subject: [PATCH 06/22] update --- examples/applications/RecommendationSystems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index 016b154..4cb0751 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -16,7 +16,7 @@ def default_args(**kwargs): parser.add_argument("--number-of-state-bits", default=8, type=int) parser.add_argument("--depth", default=3, type=int) parser.add_argument("--hypervector-size", default=16384, type=int) - parser.add_argument("--hypervector-bits", default=328, type=int) + parser.add_argument("--hypervector-bits", default=496, type=int) parser.add_argument("--message-size", default=1024, type=int) parser.add_argument("--message-bits", default=32, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') From daf8d5ad1f8319beafbe7d4c654bd1db02695a2c Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Tue, 17 Dec 2024 14:39:07 +0000 Subject: [PATCH 07/22] update --- .../applications/RecommendationSystems.py | 50 ++++++++++++++----- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index 4cb0751..7e2cdee 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -3,6 +3,7 @@ from time import time import argparse import pandas as pd +import numpy as np import kagglehub from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder @@ -15,13 +16,13 @@ def default_args(**kwargs): parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) parser.add_argument("--depth", default=3, type=int) - parser.add_argument("--hypervector-size", default=16384, type=int) - parser.add_argument("--hypervector-bits", default=496, type=int) - parser.add_argument("--message-size", default=1024, type=int) - parser.add_argument("--message-bits", default=32, type=int) + parser.add_argument("--hypervector-size", default=1024, type=int) + parser.add_argument("--hypervector-bits", default=10, type=int) + parser.add_argument("--message-size", default=512, type=int) + parser.add_argument("--message-bits", default=10, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--number-of-examples", default=1000, type=int) + parser.add_argument("--number-of-examples", default=500, type=int) parser.add_argument("--max-included-literals", default=4, type=int) args = parser.parse_args() @@ -32,13 +33,38 @@ def default_args(**kwargs): args = default_args() -print("Creating training data") -path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") -print("Path to dataset files:", path) -data_file = path + "/amazon.csv" -data = pd.read_csv(data_file) -# 
print("Data preview:", data.head()) -data = data[['product_id', 'category', 'user_id', 'rating']] +# print("Creating training data") +# path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") +# print("Path to dataset files:", path) +# data_file = path + "/amazon.csv" +# data = pd.read_csv(data_file) +# # print("Data preview:", data.head()) +# data = data[['product_id', 'category', 'user_id', 'rating']] + +############################# artificial dataset ######################## +# Set random seed for reproducibility +np.random.seed(42) +# Define the size of the artificial dataset +num_users = 10 # Number of unique users +num_items = 50 # Number of unique items +num_categories = 10 # Number of unique categories +num_interactions = 10000 # Number of user-item interactions +# Generate random ratings (e.g., between 1 and 5) +ratings = np.random.choice(range(1, 3), num_interactions) +# Generate random user-item interactions +user_ids = np.random.choice(range(num_users), num_interactions) +item_ids = np.random.choice(range(num_items), num_interactions) +categories = np.random.choice(range(num_categories), num_interactions) +# Combine into a DataFrame +data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': categories, + 'rating': ratings +}) +print("Artificial Dataset Preview:") +print(data.head()) +######################################################################## le_user = LabelEncoder() le_item = LabelEncoder() From 9dacba5364e8abc3f9c746399f8c5185d4410cad Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 18 Dec 2024 10:25:12 +0000 Subject: [PATCH 08/22] update --- examples/applications/RecommendationSystems.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index 7e2cdee..a453d42 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -19,7 +19,7 @@ def default_args(**kwargs): parser.add_argument("--hypervector-size", default=1024, type=int) parser.add_argument("--hypervector-bits", default=10, type=int) parser.add_argument("--message-size", default=512, type=int) - parser.add_argument("--message-bits", default=10, type=int) + parser.add_argument("--message-bits", default=2, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) parser.add_argument("--number-of-examples", default=500, type=int) @@ -48,7 +48,7 @@ def default_args(**kwargs): num_users = 10 # Number of unique users num_items = 50 # Number of unique items num_categories = 10 # Number of unique categories -num_interactions = 10000 # Number of user-item interactions +num_interactions = 100000 # Number of user-item interactions # Generate random ratings (e.g., between 1 and 5) ratings = np.random.choice(range(1, 3), num_interactions) # Generate random user-item interactions From 3dd2b7c9f2d116aa7308bb1677449e7c3a798a5d Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 18 Dec 2024 10:35:51 +0000 Subject: [PATCH 09/22] run on gpu 6 --- .devcontainer/docker-compose.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 0dccd18..46271d0 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -9,4 +9,5 @@ services: devices: - driver: nvidia capabilities: [gpu] - count: 1 
# Assign number of GPUs or use 'all' to assign all available GPUs \ No newline at end of file + # count: 1 # Assign number of GPUs or use 'all' to assign all available GPUs + device_ids: ["6"] \ No newline at end of file From e9bdcd6a605e95756ad34caad8cabc811b563b35 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 18 Dec 2024 10:44:45 +0000 Subject: [PATCH 10/22] add requirments --- requirments.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 requirments.txt diff --git a/requirments.txt b/requirments.txt new file mode 100644 index 0000000..12b86c0 --- /dev/null +++ b/requirments.txt @@ -0,0 +1,7 @@ +numpy +numba +pycuda +scipy +pandas +kagglehub +scikit-learn \ No newline at end of file From da31b30562feb78fd0ed42869c144c3f0dc4a2d6 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 18 Dec 2024 13:27:36 +0000 Subject: [PATCH 11/22] update --- .../applications/RecommendationSystems.py | 166 ++++++++++++++---- 1 file changed, 130 insertions(+), 36 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index a453d42..eea88e7 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -11,18 +11,17 @@ def default_args(**kwargs): parser = argparse.ArgumentParser() parser.add_argument("--epochs", default=250, type=int) - parser.add_argument("--number-of-clauses", default=1000, type=int) - parser.add_argument("--T", default=1000, type=int) + parser.add_argument("--number-of-clauses", default=10000, type=int) + parser.add_argument("--T", default=10000, type=int) parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) - parser.add_argument("--depth", default=3, type=int) - parser.add_argument("--hypervector-size", default=1024, type=int) - parser.add_argument("--hypervector-bits", default=10, type=int) - parser.add_argument("--message-size", default=512, type=int) - parser.add_argument("--message-bits", default=2, type=int) + parser.add_argument("--depth", default=1, type=int) + parser.add_argument("--hypervector-size", default=4096, type=int) + parser.add_argument("--hypervector-bits", default=256, type=int) + parser.add_argument("--message-size", default=4096, type=int) + parser.add_argument("--message-bits", default=256, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--number-of-examples", default=500, type=int) parser.add_argument("--max-included-literals", default=4, type=int) args = parser.parse_args() @@ -33,38 +32,133 @@ def default_args(**kwargs): args = default_args() -# print("Creating training data") -# path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") -# print("Path to dataset files:", path) -# data_file = path + "/amazon.csv" -# data = pd.read_csv(data_file) -# # print("Data preview:", data.head()) -# data = data[['product_id', 'category', 'user_id', 'rating']] +############################# real dataset ######################## + +print("Creating training data") +path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") +print("Path to dataset files:", path) +data_file = path + "/amazon.csv" +data = pd.read_csv(data_file) +# print("Data preview:", data.head()) +data = data[['product_id', 'category', 'user_id', 'rating']] ############################# artificial dataset ######################## + # 
Set random seed for reproducibility -np.random.seed(42) -# Define the size of the artificial dataset -num_users = 10 # Number of unique users -num_items = 50 # Number of unique items -num_categories = 10 # Number of unique categories -num_interactions = 100000 # Number of user-item interactions -# Generate random ratings (e.g., between 1 and 5) -ratings = np.random.choice(range(1, 3), num_interactions) -# Generate random user-item interactions -user_ids = np.random.choice(range(num_users), num_interactions) -item_ids = np.random.choice(range(num_items), num_interactions) -categories = np.random.choice(range(num_categories), num_interactions) -# Combine into a DataFrame -data = pd.DataFrame({ - 'user_id': user_ids, - 'product_id': item_ids, - 'category': categories, - 'rating': ratings -}) -print("Artificial Dataset Preview:") -print(data.head()) +# np.random.seed(42) + +########################## ver 1 ############################ + +# num_users = 5 # Number of unique users +# num_items =10 # Number of unique items +# num_categories = 5 # Number of unique categories +# num_interactions = 1000 # Number of user-item interactions +# # Generate random ratings (e.g., between 1 and 5) +# ratings = np.random.choice(range(1, 3), num_interactions) +# # Generate random user-item interactions +# user_ids = np.random.choice(range(num_users), num_interactions) +# item_ids = np.random.choice(range(num_items), num_interactions) +# categories = np.random.choice(range(num_categories), num_interactions) + +# data = pd.DataFrame({ +# 'user_id': user_ids, +# 'product_id': item_ids, +# 'category': categories, +# 'rating': ratings +# }) +# print("Artificial Dataset Preview:") + +########################## ver 2 ############################ + +# Parameters +# num_users = 100 # Number of unique users +# num_items = 50 # Number of unique items +# num_categories = 50 # Number of unique categories +# num_interactions = 1000 # Number of user-item interactions +# noise_ratio = 0.01 # Percentage of noisy interactions + +# # Generate user preferences: each user prefers 1-3 random categories +# user_preferences = { +# user: np.random.choice(range(num_categories), size=np.random.randint(1, 4), replace=False) +# for user in range(num_users) +# } + +# # Assign each item to a category +# item_categories = {item: np.random.choice(range(num_categories)) for item in range(num_items)} + +# # Generate interactions +# user_ids = np.random.choice(range(num_users), num_interactions) +# item_ids = np.random.choice(range(num_items), num_interactions) + +# # Generate ratings based on the pattern +# ratings = [] +# for user, item in zip(user_ids, item_ids): +# item_category = item_categories[item] +# if item_category in user_preferences[user]: +# ratings.append(np.random.choice([3, 4])) # High rating for preferred categories +# else: +# ratings.append(np.random.choice([1, 2])) # Low rating otherwise + +# # Introduce noise +# num_noisy = int(noise_ratio * num_interactions) +# noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) +# for idx in noisy_indices: +# ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating + +# # Combine into a DataFrame +# data = pd.DataFrame({ +# 'user_id': user_ids, +# 'product_id': item_ids, +# 'category': [item_categories[item] for item in item_ids], +# 'rating': ratings +# }) +# print("Artificial Dataset Preview:") + +########################### ver 3 ############################## + +# Parameters +# num_users = 100 # Number of unique users +# num_items = 50 
# Number of unique items +# num_categories = 5 # Number of unique categories +# num_interactions = 10000 # Number of user-item interactions +# noise_ratio = 0.01 # Percentage of noisy interactions + +# # Step 1: Define deterministic user preferences +# user_preferences = {user: user % num_categories for user in range(num_users)} + +# # Step 2: Assign items to categories in a cyclic pattern +# item_categories = {item: item % num_categories for item in range(num_items)} + +# # Step 3: Generate deterministic interactions +# user_ids = np.arange(num_interactions) % num_users # Cycle through users +# item_ids = np.arange(num_interactions) % num_items # Cycle through items + +# # Step 4: Generate ratings based on the pattern +# ratings = [] +# for user, item in zip(user_ids, item_ids): +# preferred_category = user_preferences[user] +# item_category = item_categories[item] +# if item_category == preferred_category: +# ratings.append(5) # High rating for preferred category +# else: +# ratings.append(1) # Low rating otherwise + +# # Step 5: Introduce noise +# num_noisy = int(noise_ratio * num_interactions) +# noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) +# for idx in noisy_indices: +# ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating + +# # Step 6: Create a DataFrame +# data = pd.DataFrame({ +# 'user_id': user_ids, +# 'product_id': item_ids, +# 'category': [item_categories[item] for item in item_ids], +# 'rating': ratings +# }) + ######################################################################## +print(data.head()) le_user = LabelEncoder() le_item = LabelEncoder() From fababa59963ca02253cdb8ff9a1f9127d228c607 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 18 Dec 2024 13:47:42 +0000 Subject: [PATCH 12/22] expanded ds --- .../applications/RecommendationSystems.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index eea88e7..db7000f 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -38,10 +38,38 @@ def default_args(**kwargs): path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") print("Path to dataset files:", path) data_file = path + "/amazon.csv" -data = pd.read_csv(data_file) +org_data = pd.read_csv(data_file) # print("Data preview:", data.head()) -data = data[['product_id', 'category', 'user_id', 'rating']] - +org_data = org_data[['product_id', 'category', 'user_id', 'rating']] +#################################### expanded +org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN +org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings +org_data['rating'] = org_data['rating'].astype(int) +# Expand the dataset 10 times +data = pd.concat([org_data] * 10, ignore_index=True) + +# Shuffle the expanded dataset +data = data.sample(frac=1, random_state=42).reset_index(drop=True) + +# Add noise +# Define the noise ratio +noise_ratio = 0.1 # 10% noise + +# Select rows to apply noise +num_noisy_rows = int(noise_ratio * len(data)) +noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False) + +# Add noise to ratings +data.loc[noisy_indices, 'rating'] = np.random.choice(range(1, 6), size=num_noisy_rows) + +# Add noise to categories +unique_categories = data['category'].unique() +data.loc[noisy_indices, 'category'] = 
np.random.choice(unique_categories, size=num_noisy_rows) + +# Print a preview of the noisy and expanded dataset +print("Original data shape:", org_data.shape) +print("Expanded data shape:", data.shape) +print("Data preview:\n", data.head()) ############################# artificial dataset ######################## # Set random seed for reproducibility From 82305ab67649e458064ee4bfb5d732da250b433b Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Thu, 19 Dec 2024 14:57:03 +0000 Subject: [PATCH 13/22] update --- .../applications/RecommendationSystems.py | 2 +- examples/applications/test.ipynb | 226 +++++++++++++++--- 2 files changed, 199 insertions(+), 29 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index db7000f..4a1daa4 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -22,7 +22,7 @@ def default_args(**kwargs): parser.add_argument("--message-bits", default=256, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--max-included-literals", default=4, type=int) + parser.add_argument("--max-included-literals", default=10, type=int) args = parser.parse_args() for key, value in kwargs.items(): diff --git a/examples/applications/test.ipynb b/examples/applications/test.ipynb index 44e0294..7d389f1 100644 --- a/examples/applications/test.ipynb +++ b/examples/applications/test.ipynb @@ -2,66 +2,138 @@ "cells": [ { "cell_type": "code", - "execution_count": 19, + "execution_count": 1, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Creating training data\n", - "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", - "Electronics|HomeTheater,TV&Video|Accessories|RemoteControls\n", - "X_train shape: (1172, 3)\n", - "y_train shape: (1172,)\n", - "X_test shape: (293, 3)\n", - "y_test shape: (293,)\n", - "111\n", - "Electronics|HomeTheater,TV&Video|Accessories|RemoteControls\n" + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "usage: ipykernel_launcher.py [-h] [--epochs EPOCHS]\n", + " [--number-of-clauses NUMBER_OF_CLAUSES] [--T T]\n", + " [--s S]\n", + " [--number-of-state-bits NUMBER_OF_STATE_BITS]\n", + " [--depth DEPTH]\n", + " [--hypervector-size HYPERVECTOR_SIZE]\n", + " [--hypervector-bits HYPERVECTOR_BITS]\n", + " [--message-size MESSAGE_SIZE]\n", + " [--message-bits MESSAGE_BITS] [--double-hashing]\n", + " [--noise NOISE]\n", + " [--max-included-literals MAX_INCLUDED_LITERALS]\n", + "ipykernel_launcher.py: error: unrecognized arguments: --f=/root/.local/share/jupyter/runtime/kernel-v3a1695e0e67c01cd0a818bc897e0f886c634ee3d4.json\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3585: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", + " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n" ] } ], "source": [ "from GraphTsetlinMachine.graphs import Graphs\n", - "import numpy as np\n", - "from scipy.sparse import csr_matrix\n", "from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine\n", "from time import time\n", "import argparse\n", - "import random\n", "import pandas as pd\n", + "import numpy as np\n", "import kagglehub\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", + "def default_args(**kwargs):\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--epochs\", default=250, type=int)\n", + " parser.add_argument(\"--number-of-clauses\", default=10000, type=int)\n", + " parser.add_argument(\"--T\", default=10000, type=int)\n", + " parser.add_argument(\"--s\", default=10.0, type=float)\n", + " parser.add_argument(\"--number-of-state-bits\", default=8, type=int)\n", + " parser.add_argument(\"--depth\", default=1, type=int)\n", + " parser.add_argument(\"--hypervector-size\", default=4096, type=int)\n", + " parser.add_argument(\"--hypervector-bits\", default=256, type=int)\n", + " parser.add_argument(\"--message-size\", default=4096, type=int)\n", + " parser.add_argument(\"--message-bits\", default=256, type=int)\n", + " parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true')\n", + " parser.add_argument(\"--noise\", default=0.01, type=float)\n", + " parser.add_argument(\"--max-included-literals\", default=10, type=int)\n", + "\n", + " args = parser.parse_args()\n", + " for key, value in kwargs.items():\n", + " if key in args.__dict__:\n", + " setattr(args, key, value)\n", + " return args\n", + "\n", + "args = default_args()\n", + "\n", + "############################# real dataset ########################\n", "\n", "print(\"Creating training data\")\n", "path = kagglehub.dataset_download(\"karkavelrajaj/amazon-sales-dataset\")\n", "print(\"Path to dataset files:\", path)\n", - "data_file = path + \"/amazon.csv\" # Adjust this path if necessary\n", - "data = pd.read_csv(data_file)\n", + "data_file = path + \"/amazon.csv\" \n", + "org_data = pd.read_csv(data_file)\n", "# print(\"Data preview:\", data.head())\n", - "data = data[['product_id', 'category', 'user_id', 'rating']]\n", - 
"print(data['category'][100])\n", + "org_data = org_data[['product_id', 'category', 'user_id', 'rating']]\n", + "#################################### expanded \n", + "org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN\n", + "org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings\n", + "org_data['rating'] = org_data['rating'].astype(int)\n", + "# Expand the dataset 10 times\n", + "data = pd.concat([org_data] * 10, ignore_index=True)\n", + "\n", + "# Shuffle the expanded dataset\n", + "data = data.sample(frac=1, random_state=42).reset_index(drop=True)\n", + "\n", + "# Add noise\n", + "# Define the noise ratio\n", + "noise_ratio = 0.1 # 10% noise\n", + "\n", + "# Select rows to apply noise\n", + "num_noisy_rows = int(noise_ratio * len(data))\n", + "noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False)\n", + "\n", + "# Add noise to ratings\n", + "data.loc[noisy_indices, 'rating'] = np.random.choice(range(1, 6), size=num_noisy_rows)\n", + "\n", + "# Add noise to categories\n", + "unique_categories = data['category'].unique()\n", + "data.loc[noisy_indices, 'category'] = np.random.choice(unique_categories, size=num_noisy_rows)\n", + "\n", + "# Print a preview of the noisy and expanded dataset\n", + "print(\"Original data shape:\", org_data.shape)\n", + "print(\"Expanded data shape:\", data.shape)\n", + "print(\"Data preview:\\n\", data.head())\n", + "\n", + "print(data.head())\n", " \n", - "# Step 2: Encode user_id, product_id, and category with LabelEncoder\n", - "# This converts string identifiers into unique integer values\n", "le_user = LabelEncoder()\n", "le_item = LabelEncoder()\n", "le_category = LabelEncoder()\n", + "le_rating = LabelEncoder() \n", "\n", "data['user_id'] = le_user.fit_transform(data['user_id'])\n", "data['product_id'] = le_item.fit_transform(data['product_id'])\n", "data['category'] = le_category.fit_transform(data['category'])\n", + "data['rating'] = le_rating.fit_transform(data['rating'])\n", "\n", - "# Step 3: Prepare X (features) and y (labels)\n", - "x = data[['user_id', 'product_id', 'category']].values # Features: [user, item, category]\n", - "y = data['rating'].values # Labels: rating\n", + "x = data[['user_id', 'product_id', 'category']].values \n", + "y = data['rating'].values \n", "\n", - "# Step 4: Split the data into training and test sets\n", "X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)\n", "\n", - "# Display the shapes to verify the split\n", "print(\"X_train shape:\", X_train.shape)\n", "print(\"y_train shape:\", Y_train.shape)\n", "print(\"X_test shape:\", X_test.shape)\n", @@ -71,9 +143,107 @@ "items = data['product_id'].unique()\n", "categories = data['category'].unique()\n", "\n", - "print(categories[100])\n", - "original_user_id = le_category.inverse_transform([data['category'][100]])[0]\n", - "print(original_user_id)" + "# Initialize Graphs with symbols for GTM\n", + "number_of_nodes = 3\n", + "symbols = []\n", + "symbols = [\"U_\" + str(u) for u in users] + [\"I_\" + str(i) for i in items] + [\"C_\" + str(c) for c in categories] \n", + "print(len(symbols))\n", + "# Train data\n", + "graphs_train = Graphs(\n", + " X_train.shape[0],\n", + " symbols=symbols,\n", + " hypervector_size=args.hypervector_size,\n", + " hypervector_bits=args.hypervector_bits,\n", + " double_hashing = args.double_hashing\n", + ")\n", + "for graph_id in range(X_train.shape[0]):\n", + " 
graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes)\n", + "graphs_train.prepare_node_configuration()\n", + "for graph_id in range(X_train.shape[0]):\n", + " for node_id in range(graphs_train.number_of_graph_nodes[graph_id]):\n", + " number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1\n", + " if node_id == 0:\n", + " graphs_train.add_graph_node(graph_id, \"User\", number_of_edges)\n", + " elif node_id == 1:\n", + " graphs_train.add_graph_node(graph_id, \"Item\", number_of_edges)\n", + " else:\n", + " graphs_train.add_graph_node(graph_id, \"Category\", number_of_edges)\n", + "graphs_train.prepare_edge_configuration()\n", + "for graph_id in range(X_train.shape[0]):\n", + " for node_id in range(graphs_train.number_of_graph_nodes[graph_id]):\n", + " if node_id == 0:\n", + " graphs_train.add_graph_node_edge(graph_id, \"User\", \"Item\", \"UserItem\")\n", + " \n", + " if node_id == 1:\n", + " graphs_train.add_graph_node_edge(graph_id, \"Item\", \"Category\", \"ItemCategory\")\n", + " graphs_train.add_graph_node_edge(graph_id, \"Item\", \"User\", \"ItemUser\")\n", + " \n", + " if node_id == 2:\n", + " graphs_train.add_graph_node_edge(graph_id, \"Category\", \"Item\", \"CatrgoryItem\")\n", + "\n", + " graphs_train.add_graph_node_property(graph_id, \"User\", \"U_\" + str(X_train[graph_id][0]))\n", + " graphs_train.add_graph_node_property(graph_id, \"Item\", \"I_\" + str(X_train[graph_id][1]))\n", + " graphs_train.add_graph_node_property(graph_id, \"Category\", \"C_\" + str(X_train[graph_id][2]))\n", + "graphs_train.encode()\n", + "print(\"Training data produced\")\n", + "\n", + "# Test data\n", + "graphs_test = Graphs(X_test.shape[0], init_with=graphs_train)\n", + "for graph_id in range(X_test.shape[0]):\n", + " graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes)\n", + "graphs_test.prepare_node_configuration()\n", + "for graph_id in range(X_test.shape[0]):\n", + " for node_id in range(graphs_test.number_of_graph_nodes[graph_id]):\n", + " number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1\n", + " if node_id == 0:\n", + " graphs_test.add_graph_node(graph_id, \"User\", number_of_edges)\n", + " elif node_id == 1:\n", + " graphs_test.add_graph_node(graph_id, \"Item\", number_of_edges)\n", + " else:\n", + " graphs_test.add_graph_node(graph_id, \"Category\", number_of_edges)\n", + "graphs_test.prepare_edge_configuration()\n", + "for graph_id in range(X_test.shape[0]):\n", + " for node_id in range(graphs_test.number_of_graph_nodes[graph_id]):\n", + " if node_id == 0:\n", + " graphs_test.add_graph_node_edge(graph_id, \"User\", \"Item\", \"UserItem\")\n", + " \n", + " if node_id == 1:\n", + " graphs_test.add_graph_node_edge(graph_id, \"Item\", \"Category\", \"ItemCategory\")\n", + " graphs_test.add_graph_node_edge(graph_id, \"Item\", \"User\", \"ItemUser\")\n", + " \n", + " if node_id == 2:\n", + " graphs_test.add_graph_node_edge(graph_id, \"Category\", \"Item\", \"CatrgoryItem\")\n", + "\n", + " graphs_test.add_graph_node_property(graph_id, \"User\", \"U_\" + str(X_test[graph_id][0]))\n", + " graphs_test.add_graph_node_property(graph_id, \"Item\", \"I_\" + str(X_test[graph_id][1]))\n", + " graphs_test.add_graph_node_property(graph_id, \"Category\", \"C_\" + str(X_test[graph_id][2]))\n", + "graphs_test.encode()\n", + "print(\"Testing data produced\")\n", + "\n", + "tm = MultiClassGraphTsetlinMachine(\n", + " args.number_of_clauses,\n", + " args.T,\n", + " args.s,\n", + " 
number_of_state_bits = args.number_of_state_bits,\n", + " depth=args.depth,\n", + " message_size=args.message_size,\n", + " message_bits=args.message_bits,\n", + " max_included_literals=args.max_included_literals,\n", + " double_hashing = args.double_hashing\n", + ")\n", + "\n", + "for i in range(args.epochs):\n", + " start_training = time()\n", + " tm.fit(graphs_train, Y_train, epochs=1, incremental=True)\n", + " stop_training = time()\n", + "\n", + " start_testing = time()\n", + " result_test = 100*(tm.predict(graphs_test) == Y_test).mean()\n", + " stop_testing = time()\n", + "\n", + " result_train = 100*(tm.predict(graphs_train) == Y_train).mean()\n", + "\n", + " print(\"%d %.2f %.2f %.2f %.2f\" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing))" ] } ], From 218a96f11d24076ac1076c4d153b280b778700a0 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Fri, 20 Dec 2024 10:16:33 +0000 Subject: [PATCH 14/22] before add example no --- .../applications/RecommendationSystems.py | 47 ++++++++++--------- examples/applications/test.ipynb | 2 +- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py index 4a1daa4..2b57ecc 100644 --- a/examples/applications/RecommendationSystems.py +++ b/examples/applications/RecommendationSystems.py @@ -10,19 +10,20 @@ def default_args(**kwargs): parser = argparse.ArgumentParser() - parser.add_argument("--epochs", default=250, type=int) + parser.add_argument("--epochs", default=10, type=int) parser.add_argument("--number-of-clauses", default=10000, type=int) parser.add_argument("--T", default=10000, type=int) parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) - parser.add_argument("--depth", default=1, type=int) + parser.add_argument("--depth", default=3, type=int) parser.add_argument("--hypervector-size", default=4096, type=int) parser.add_argument("--hypervector-bits", default=256, type=int) - parser.add_argument("--message-size", default=4096, type=int) - parser.add_argument("--message-bits", default=256, type=int) + parser.add_argument("--message-size", default=256, type=int) + parser.add_argument("--message-bits", default=2, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) parser.add_argument("--max-included-literals", default=10, type=int) + parser.add_argument("--number-of-examples", default=1000, type=int) args = parser.parse_args() for key, value in kwargs.items(): @@ -314,25 +315,25 @@ def default_args(**kwargs): print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) -# weights = tm.get_state()[1].reshape(2, -1) -# for i in range(tm.number_of_clauses): -# print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') -# l = [] -# for k in range(args.hypervector_size * 2): -# if tm.ta_action(0, i, k): -# if k < args.hypervector_size: -# l.append("x%d" % (k)) -# else: -# l.append("NOT x%d" % (k - args.hypervector_size)) - -# for k in range(args.message_size * 2): -# if tm.ta_action(1, i, k): -# if k < args.message_size: -# l.append("c%d" % (k)) -# else: -# l.append("NOT c%d" % (k - args.message_size)) - -# print(" AND ".join(l)) +weights = tm.get_state()[1].reshape(2, -1) +for i in range(tm.number_of_clauses): + print("Clause #%d W:(%d %d)" % (i, 
weights[0,i], weights[1,i]), end=' ') + l = [] + for k in range(args.hypervector_size * 2): + if tm.ta_action(0, i, k): + if k < args.hypervector_size: + l.append("x%d" % (k)) + else: + l.append("NOT x%d" % (k - args.hypervector_size)) + + for k in range(args.message_size * 2): + if tm.ta_action(1, i, k): + if k < args.message_size: + l.append("c%d" % (k)) + else: + l.append("NOT c%d" % (k - args.message_size)) + + print(" AND ".join(l)) # print(graphs_test.hypervectors) # print(tm.hypervectors) diff --git a/examples/applications/test.ipynb b/examples/applications/test.ipynb index 7d389f1..1465bf1 100644 --- a/examples/applications/test.ipynb +++ b/examples/applications/test.ipynb @@ -22,7 +22,7 @@ " [--message-bits MESSAGE_BITS] [--double-hashing]\n", " [--noise NOISE]\n", " [--max-included-literals MAX_INCLUDED_LITERALS]\n", - "ipykernel_launcher.py: error: unrecognized arguments: --f=/root/.local/share/jupyter/runtime/kernel-v3a1695e0e67c01cd0a818bc897e0f886c634ee3d4.json\n" + "ipykernel_launcher.py: error: unrecognized arguments: --f=/root/.local/share/jupyter/runtime/kernel-v306f6e67794e909fd94dbef768cafee2e613728cc.json\n" ] }, { From 799493fd1d3241fb0a6e5271bbaeaecb8c9271cb Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Fri, 20 Dec 2024 11:49:12 +0000 Subject: [PATCH 15/22] orgnizing files --- .../applications/RecommendationSystems.py | 340 ------------------ .../prepare_dataset.cpython-310.pyc | Bin 0 -> 1415 bytes .../recommendation/main_products.py | 178 +++++++++ .../recommendation/prepare_dataset.py | 145 ++++++++ 4 files changed, 323 insertions(+), 340 deletions(-) delete mode 100644 examples/applications/RecommendationSystems.py create mode 100644 examples/applications/recommendation/__pycache__/prepare_dataset.cpython-310.pyc create mode 100644 examples/applications/recommendation/main_products.py create mode 100644 examples/applications/recommendation/prepare_dataset.py diff --git a/examples/applications/RecommendationSystems.py b/examples/applications/RecommendationSystems.py deleted file mode 100644 index 2b57ecc..0000000 --- a/examples/applications/RecommendationSystems.py +++ /dev/null @@ -1,340 +0,0 @@ -from GraphTsetlinMachine.graphs import Graphs -from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine -from time import time -import argparse -import pandas as pd -import numpy as np -import kagglehub -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder - -def default_args(**kwargs): - parser = argparse.ArgumentParser() - parser.add_argument("--epochs", default=10, type=int) - parser.add_argument("--number-of-clauses", default=10000, type=int) - parser.add_argument("--T", default=10000, type=int) - parser.add_argument("--s", default=10.0, type=float) - parser.add_argument("--number-of-state-bits", default=8, type=int) - parser.add_argument("--depth", default=3, type=int) - parser.add_argument("--hypervector-size", default=4096, type=int) - parser.add_argument("--hypervector-bits", default=256, type=int) - parser.add_argument("--message-size", default=256, type=int) - parser.add_argument("--message-bits", default=2, type=int) - parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') - parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--max-included-literals", default=10, type=int) - parser.add_argument("--number-of-examples", default=1000, type=int) - - args = parser.parse_args() - for key, value in kwargs.items(): - if key in 
args.__dict__: - setattr(args, key, value) - return args - -args = default_args() - -############################# real dataset ######################## - -print("Creating training data") -path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") -print("Path to dataset files:", path) -data_file = path + "/amazon.csv" -org_data = pd.read_csv(data_file) -# print("Data preview:", data.head()) -org_data = org_data[['product_id', 'category', 'user_id', 'rating']] -#################################### expanded -org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN -org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings -org_data['rating'] = org_data['rating'].astype(int) -# Expand the dataset 10 times -data = pd.concat([org_data] * 10, ignore_index=True) - -# Shuffle the expanded dataset -data = data.sample(frac=1, random_state=42).reset_index(drop=True) - -# Add noise -# Define the noise ratio -noise_ratio = 0.1 # 10% noise - -# Select rows to apply noise -num_noisy_rows = int(noise_ratio * len(data)) -noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False) - -# Add noise to ratings -data.loc[noisy_indices, 'rating'] = np.random.choice(range(1, 6), size=num_noisy_rows) - -# Add noise to categories -unique_categories = data['category'].unique() -data.loc[noisy_indices, 'category'] = np.random.choice(unique_categories, size=num_noisy_rows) - -# Print a preview of the noisy and expanded dataset -print("Original data shape:", org_data.shape) -print("Expanded data shape:", data.shape) -print("Data preview:\n", data.head()) -############################# artificial dataset ######################## - -# Set random seed for reproducibility -# np.random.seed(42) - -########################## ver 1 ############################ - -# num_users = 5 # Number of unique users -# num_items =10 # Number of unique items -# num_categories = 5 # Number of unique categories -# num_interactions = 1000 # Number of user-item interactions -# # Generate random ratings (e.g., between 1 and 5) -# ratings = np.random.choice(range(1, 3), num_interactions) -# # Generate random user-item interactions -# user_ids = np.random.choice(range(num_users), num_interactions) -# item_ids = np.random.choice(range(num_items), num_interactions) -# categories = np.random.choice(range(num_categories), num_interactions) - -# data = pd.DataFrame({ -# 'user_id': user_ids, -# 'product_id': item_ids, -# 'category': categories, -# 'rating': ratings -# }) -# print("Artificial Dataset Preview:") - -########################## ver 2 ############################ - -# Parameters -# num_users = 100 # Number of unique users -# num_items = 50 # Number of unique items -# num_categories = 50 # Number of unique categories -# num_interactions = 1000 # Number of user-item interactions -# noise_ratio = 0.01 # Percentage of noisy interactions - -# # Generate user preferences: each user prefers 1-3 random categories -# user_preferences = { -# user: np.random.choice(range(num_categories), size=np.random.randint(1, 4), replace=False) -# for user in range(num_users) -# } - -# # Assign each item to a category -# item_categories = {item: np.random.choice(range(num_categories)) for item in range(num_items)} - -# # Generate interactions -# user_ids = np.random.choice(range(num_users), num_interactions) -# item_ids = np.random.choice(range(num_items), num_interactions) - -# # Generate ratings based on the pattern -# ratings = [] -# for user, item in 
zip(user_ids, item_ids): -# item_category = item_categories[item] -# if item_category in user_preferences[user]: -# ratings.append(np.random.choice([3, 4])) # High rating for preferred categories -# else: -# ratings.append(np.random.choice([1, 2])) # Low rating otherwise - -# # Introduce noise -# num_noisy = int(noise_ratio * num_interactions) -# noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) -# for idx in noisy_indices: -# ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating - -# # Combine into a DataFrame -# data = pd.DataFrame({ -# 'user_id': user_ids, -# 'product_id': item_ids, -# 'category': [item_categories[item] for item in item_ids], -# 'rating': ratings -# }) -# print("Artificial Dataset Preview:") - -########################### ver 3 ############################## - -# Parameters -# num_users = 100 # Number of unique users -# num_items = 50 # Number of unique items -# num_categories = 5 # Number of unique categories -# num_interactions = 10000 # Number of user-item interactions -# noise_ratio = 0.01 # Percentage of noisy interactions - -# # Step 1: Define deterministic user preferences -# user_preferences = {user: user % num_categories for user in range(num_users)} - -# # Step 2: Assign items to categories in a cyclic pattern -# item_categories = {item: item % num_categories for item in range(num_items)} - -# # Step 3: Generate deterministic interactions -# user_ids = np.arange(num_interactions) % num_users # Cycle through users -# item_ids = np.arange(num_interactions) % num_items # Cycle through items - -# # Step 4: Generate ratings based on the pattern -# ratings = [] -# for user, item in zip(user_ids, item_ids): -# preferred_category = user_preferences[user] -# item_category = item_categories[item] -# if item_category == preferred_category: -# ratings.append(5) # High rating for preferred category -# else: -# ratings.append(1) # Low rating otherwise - -# # Step 5: Introduce noise -# num_noisy = int(noise_ratio * num_interactions) -# noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) -# for idx in noisy_indices: -# ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating - -# # Step 6: Create a DataFrame -# data = pd.DataFrame({ -# 'user_id': user_ids, -# 'product_id': item_ids, -# 'category': [item_categories[item] for item in item_ids], -# 'rating': ratings -# }) - -######################################################################## -print(data.head()) - -le_user = LabelEncoder() -le_item = LabelEncoder() -le_category = LabelEncoder() -le_rating = LabelEncoder() - -data['user_id'] = le_user.fit_transform(data['user_id']) -data['product_id'] = le_item.fit_transform(data['product_id']) -data['category'] = le_category.fit_transform(data['category']) -data['rating'] = le_rating.fit_transform(data['rating']) - -x = data[['user_id', 'product_id', 'category']].values -y = data['rating'].values - -X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42) - -print("X_train shape:", X_train.shape) -print("y_train shape:", Y_train.shape) -print("X_test shape:", X_test.shape) -print("y_test shape:", Y_test.shape) - -users = data['user_id'].unique() -items = data['product_id'].unique() -categories = data['category'].unique() - -# Initialize Graphs with symbols for GTM -number_of_nodes = 3 -symbols = [] -symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] -print(len(symbols)) -# Train 
data -graphs_train = Graphs( - X_train.shape[0], - symbols=symbols, - hypervector_size=args.hypervector_size, - hypervector_bits=args.hypervector_bits, - double_hashing = args.double_hashing -) -for graph_id in range(X_train.shape[0]): - graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) -graphs_train.prepare_node_configuration() -for graph_id in range(X_train.shape[0]): - for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): - number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 - if node_id == 0: - graphs_train.add_graph_node(graph_id, "User", number_of_edges) - elif node_id == 1: - graphs_train.add_graph_node(graph_id, "Item", number_of_edges) - else: - graphs_train.add_graph_node(graph_id, "Category", number_of_edges) -graphs_train.prepare_edge_configuration() -for graph_id in range(X_train.shape[0]): - for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): - if node_id == 0: - graphs_train.add_graph_node_edge(graph_id, "User", "Item", "UserItem") - - if node_id == 1: - graphs_train.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") - graphs_train.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") - - if node_id == 2: - graphs_train.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") - - graphs_train.add_graph_node_property(graph_id, "User", "U_" + str(X_train[graph_id][0])) - graphs_train.add_graph_node_property(graph_id, "Item", "I_" + str(X_train[graph_id][1])) - graphs_train.add_graph_node_property(graph_id, "Category", "C_" + str(X_train[graph_id][2])) -graphs_train.encode() -print("Training data produced") - -# Test data -graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) -for graph_id in range(X_test.shape[0]): - graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) -graphs_test.prepare_node_configuration() -for graph_id in range(X_test.shape[0]): - for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): - number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 - if node_id == 0: - graphs_test.add_graph_node(graph_id, "User", number_of_edges) - elif node_id == 1: - graphs_test.add_graph_node(graph_id, "Item", number_of_edges) - else: - graphs_test.add_graph_node(graph_id, "Category", number_of_edges) -graphs_test.prepare_edge_configuration() -for graph_id in range(X_test.shape[0]): - for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): - if node_id == 0: - graphs_test.add_graph_node_edge(graph_id, "User", "Item", "UserItem") - - if node_id == 1: - graphs_test.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") - graphs_test.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") - - if node_id == 2: - graphs_test.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") - - graphs_test.add_graph_node_property(graph_id, "User", "U_" + str(X_test[graph_id][0])) - graphs_test.add_graph_node_property(graph_id, "Item", "I_" + str(X_test[graph_id][1])) - graphs_test.add_graph_node_property(graph_id, "Category", "C_" + str(X_test[graph_id][2])) -graphs_test.encode() -print("Testing data produced") - -tm = MultiClassGraphTsetlinMachine( - args.number_of_clauses, - args.T, - args.s, - number_of_state_bits = args.number_of_state_bits, - depth=args.depth, - message_size=args.message_size, - message_bits=args.message_bits, - max_included_literals=args.max_included_literals, - double_hashing = args.double_hashing -) - -for i in range(args.epochs): - 
start_training = time() - tm.fit(graphs_train, Y_train, epochs=1, incremental=True) - stop_training = time() - - start_testing = time() - result_test = 100*(tm.predict(graphs_test) == Y_test).mean() - stop_testing = time() - - result_train = 100*(tm.predict(graphs_train) == Y_train).mean() - - print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) - -weights = tm.get_state()[1].reshape(2, -1) -for i in range(tm.number_of_clauses): - print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') - l = [] - for k in range(args.hypervector_size * 2): - if tm.ta_action(0, i, k): - if k < args.hypervector_size: - l.append("x%d" % (k)) - else: - l.append("NOT x%d" % (k - args.hypervector_size)) - - for k in range(args.message_size * 2): - if tm.ta_action(1, i, k): - if k < args.message_size: - l.append("c%d" % (k)) - else: - l.append("NOT c%d" % (k - args.message_size)) - - print(" AND ".join(l)) - -# print(graphs_test.hypervectors) -# print(tm.hypervectors) -# print(graphs_test.edge_type_id) \ No newline at end of file diff --git a/examples/applications/recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/applications/recommendation/__pycache__/prepare_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..334918248b0038345364fc10337498871cad89c2 GIT binary patch literal 1415 zcmZ8hOK&SR7_~i*q)D2lZBzsjC}Kqr_jbjmssQQ2T^1yekh&T9j(w+-PR4`n^bsYi zvds^GRm+ZFfcPukvg)1{E6z75QZKf~kI%>7^LWheZjYh;@uMy-Q^x+H&GRO(`3j%= z41;2dSL~d(vp5&+9G%BXsOV>Qo~T$Qm{Y~SXVdf%H`rND<9{u}g1%Sy+-nRT8t*t~ zp5F^49{8Fh(Td#(&Q?st@3A%a(S5w)I1@NaR$`X4r=nmg{f)oRV5@VsJbGod^krR4 zyp?553njf=etaqIrM%LWl`}azmUFo@^$RB}?OqVTX}>)BPWp?9H|-4wCT~j|pDg=N z@Rzx}T8_W9Wl`3$YEct+Asc-%jYF@oMlEuml`3>|>2+c3ZRjkVwj@ez>(w+3Nn^{} zhn-7V6qUYMya}Tx2Qp=@>&i$KL<1!VL}f4;lGeyH3ULDm!w$hS@-1|YEi&5v&g>rY z_|NIFY?@rM|IVX>TSfV3u%AD*IW%i2av(4<1 zA|BRNr`sR^C`1mQEh!DTEG7keQYh~t{^WYTlv9hKqe*J~opI^QvoGX_)bWqM#FV$ko z?T45wJ+kHao0|qYw0h3iGdfsI9Vy*Pe!$1#NW{@djQKb3mq@kWxOSYm;R= qq%?t7&8__e>unwh`OGpIu&w#-=^X!wN`FD6c0__Mo)@|p^!^25{fROF literal 0 HcmV?d00001 diff --git a/examples/applications/recommendation/main_products.py b/examples/applications/recommendation/main_products.py new file mode 100644 index 0000000..03d5b64 --- /dev/null +++ b/examples/applications/recommendation/main_products.py @@ -0,0 +1,178 @@ +from GraphTsetlinMachine.graphs import Graphs +from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine +from time import time +import argparse +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +import prepare_dataset + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--number-of-clauses", default=1000, type=int) + parser.add_argument("--T", default=10000, type=int) + parser.add_argument("--s", default=10.0, type=float) + parser.add_argument("--number-of-state-bits", default=8, type=int) + parser.add_argument("--depth", default=3, type=int) + parser.add_argument("--hypervector-size", default=4096, type=int) + parser.add_argument("--hypervector-bits", default=256, type=int) + parser.add_argument("--message-size", default=256, type=int) + parser.add_argument("--message-bits", default=2, type=int) + parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') + parser.add_argument("--noise", default=0.01, 
type=float) + parser.add_argument("--max-included-literals", default=10, type=int) + + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args +args = default_args() + +data = prepare_dataset.aug_amazon_products() +print(data.head()) +le_user = LabelEncoder() +le_item = LabelEncoder() +le_category = LabelEncoder() +le_rating = LabelEncoder() +data['user_id'] = le_user.fit_transform(data['user_id']) +data['product_id'] = le_item.fit_transform(data['product_id']) +data['category'] = le_category.fit_transform(data['category']) +data['rating'] = le_rating.fit_transform(data['rating']) +x = data[['user_id', 'product_id', 'category']].values +y = data['rating'].values +X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42) +print("X_train shape:", X_train.shape) +print("y_train shape:", Y_train.shape) +print("X_test shape:", X_test.shape) +print("y_test shape:", Y_test.shape) +users = data['user_id'].unique() +items = data['product_id'].unique() +categories = data['category'].unique() +# Initialize Graphs with symbols for GTM +number_of_nodes = 3 +symbols = [] +symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] +print("Symbols: ",len(symbols)) + +# Train data +graphs_train = Graphs( + X_train.shape[0], + symbols=symbols, + hypervector_size=args.hypervector_size, + hypervector_bits=args.hypervector_bits, + double_hashing = args.double_hashing +) +for graph_id in range(X_train.shape[0]): + graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) +graphs_train.prepare_node_configuration() +for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + graphs_train.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_train.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_train.add_graph_node(graph_id, "Category", number_of_edges) +graphs_train.prepare_edge_configuration() +for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_train.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_train.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_train.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_train.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") + + graphs_train.add_graph_node_property(graph_id, "User", "U_" + str(X_train[graph_id][0])) + graphs_train.add_graph_node_property(graph_id, "Item", "I_" + str(X_train[graph_id][1])) + graphs_train.add_graph_node_property(graph_id, "Category", "C_" + str(X_train[graph_id][2])) +graphs_train.encode() +print("Training data produced") + +# Test data +graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) +for graph_id in range(X_test.shape[0]): + graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) +graphs_test.prepare_node_configuration() +for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + graphs_test.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: 
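+            # Node 1 is the Item node: it has two edges, one to the User node (node 0) and one to the Category node (node 2)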
+ graphs_test.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_test.add_graph_node(graph_id, "Category", number_of_edges) +graphs_test.prepare_edge_configuration() +for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_test.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_test.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_test.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_test.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") + + graphs_test.add_graph_node_property(graph_id, "User", "U_" + str(X_test[graph_id][0])) + graphs_test.add_graph_node_property(graph_id, "Item", "I_" + str(X_test[graph_id][1])) + graphs_test.add_graph_node_property(graph_id, "Category", "C_" + str(X_test[graph_id][2])) +graphs_test.encode() +print("Testing data produced") + +tm = MultiClassGraphTsetlinMachine( + args.number_of_clauses, + args.T, + args.s, + number_of_state_bits = args.number_of_state_bits, + depth=args.depth, + message_size=args.message_size, + message_bits=args.message_bits, + max_included_literals=args.max_included_literals, + double_hashing = args.double_hashing +) + +for i in range(args.epochs): + start_training = time() + tm.fit(graphs_train, Y_train, epochs=1, incremental=True) + stop_training = time() + + start_testing = time() + result_test = 100*(tm.predict(graphs_test) == Y_test).mean() + stop_testing = time() + + result_train = 100*(tm.predict(graphs_train) == Y_train).mean() + print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) + +# weights = tm.get_state()[1].reshape(2, -1) +# for i in range(tm.number_of_clauses): +# print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') +# l = [] +# for k in range(args.hypervector_size * 2): +# if tm.ta_action(0, i, k): +# if k < args.hypervector_size: +# l.append("x%d" % (k)) +# else: +# l.append("NOT x%d" % (k - args.hypervector_size)) + +# for k in range(args.message_size * 2): +# if tm.ta_action(1, i, k): +# if k < args.message_size: +# l.append("c%d" % (k)) +# else: +# l.append("NOT c%d" % (k - args.message_size)) + +# print(" AND ".join(l)) + +# print(graphs_test.hypervectors) +# print(tm.hypervectors) +# print(graphs_test.edge_type_id) \ No newline at end of file diff --git a/examples/applications/recommendation/prepare_dataset.py b/examples/applications/recommendation/prepare_dataset.py new file mode 100644 index 0000000..582b569 --- /dev/null +++ b/examples/applications/recommendation/prepare_dataset.py @@ -0,0 +1,145 @@ +import pandas as pd +import kagglehub +import numpy as np + +np.random.seed(42) + +def amazon_products(): + print("Creating training data") + path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") + print("Path to dataset files:", path) + data_file = path + "/amazon.csv" + org_data = pd.read_csv(data_file) + print("Original data shape:", org_data.shape) + return org_data[['product_id', 'category', 'user_id', 'rating']] + +def aug_amazon_products(): + org_data = amazon_products() + org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN + org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings + org_data['rating'] = org_data['rating'].astype(int) + # Expand the dataset 10 times + data = pd.concat([org_data] * 10, ignore_index=True) 
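+    # Note: rows are replicated here before the later random train/test split, so near-duplicate interactions can land in both splits and inflate test accuracy; splitting before expansion (or deduplicating) would avoid this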
+ # Shuffle the expanded dataset + data = data.sample(frac=1, random_state=42).reset_index(drop=True) + # Add noise + # Define the noise ratio + noise_ratio = 0.1 # 10% noise + # Select rows to apply noise + num_noisy_rows = int(noise_ratio * len(data)) + noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False) + # Add noise to ratings + data.loc[noisy_indices, 'rating'] = np.random.choice(range(1, 6), size=num_noisy_rows) + # Add noise to categories + unique_categories = data['category'].unique() + data.loc[noisy_indices, 'category'] = np.random.choice(unique_categories, size=num_noisy_rows) + # Print a preview of the noisy and expanded dataset + print("Expanded data shape:", data.shape) + print("Data preview:\n", data.head()) + return data + +def artificial(): + num_users = 5 # Number of unique users + num_items =10 # Number of unique items + num_categories = 5 # Number of unique categories + num_interactions = 1000 # Number of user-item interactions + # Generate random ratings (e.g., between 1 and 5) + ratings = np.random.choice(range(1, 3), num_interactions) + # Generate random user-item interactions + user_ids = np.random.choice(range(num_users), num_interactions) + item_ids = np.random.choice(range(num_items), num_interactions) + categories = np.random.choice(range(num_categories), num_interactions) + + data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': categories, + 'rating': ratings + }) + return data + +def artificial_with_user_pref(): + num_users = 100 # Number of unique users + num_items = 50 # Number of unique items + num_categories = 50 # Number of unique categories + num_interactions = 1000 # Number of user-item interactions + noise_ratio = 0.01 # Percentage of noisy interactions + + # Generate user preferences: each user prefers 1-3 random categories + user_preferences = { + user: np.random.choice(range(num_categories), size=np.random.randint(1, 4), replace=False) + for user in range(num_users) + } + + # Assign each item to a category + item_categories = {item: np.random.choice(range(num_categories)) for item in range(num_items)} + + # Generate interactions + user_ids = np.random.choice(range(num_users), num_interactions) + item_ids = np.random.choice(range(num_items), num_interactions) + + # Generate ratings based on the pattern + ratings = [] + for user, item in zip(user_ids, item_ids): + item_category = item_categories[item] + if item_category in user_preferences[user]: + ratings.append(np.random.choice([3, 4])) # High rating for preferred categories + else: + ratings.append(np.random.choice([1, 2])) # Low rating otherwise + + # Introduce noise + num_noisy = int(noise_ratio * num_interactions) + noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) + for idx in noisy_indices: + ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating + + # Combine into a DataFrame + data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': [item_categories[item] for item in item_ids], + 'rating': ratings + }) + return data + +def artificial_pattered(): + num_users = 100 # Number of unique users + num_items = 50 # Number of unique items + num_categories = 5 # Number of unique categories + num_interactions = 10000 # Number of user-item interactions + noise_ratio = 0.01 # Percentage of noisy interactions + + # Step 1: Define deterministic user preferences + user_preferences = {user: user % num_categories for user in range(num_users)} + + # Step 2: Assign items 
to categories in a cyclic pattern + item_categories = {item: item % num_categories for item in range(num_items)} + + # Step 3: Generate deterministic interactions + user_ids = np.arange(num_interactions) % num_users # Cycle through users + item_ids = np.arange(num_interactions) % num_items # Cycle through items + + # Step 4: Generate ratings based on the pattern + ratings = [] + for user, item in zip(user_ids, item_ids): + preferred_category = user_preferences[user] + item_category = item_categories[item] + if item_category == preferred_category: + ratings.append(5) # High rating for preferred category + else: + ratings.append(1) # Low rating otherwise + + # Step 5: Introduce noise + num_noisy = int(noise_ratio * num_interactions) + noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) + for idx in noisy_indices: + ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating + + # Step 6: Create a DataFrame + data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': [item_categories[item] for item in item_ids], + 'rating': ratings + }) + return data \ No newline at end of file From 801b7e399fd3050920f2b1d988dad94ac6925c88 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Fri, 20 Dec 2024 13:27:26 +0000 Subject: [PATCH 16/22] update --- .../__pycache__/prepare_dataset.cpython-310.pyc | Bin 0 -> 4044 bytes .../main.py} | 13 +++++++++---- .../prepare_dataset.py | 8 +++++--- .../{ => products_recommendation}/test.ipynb | 0 .../__pycache__/prepare_dataset.cpython-310.pyc | Bin 1415 -> 0 bytes 5 files changed, 14 insertions(+), 7 deletions(-) create mode 100644 examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc rename examples/applications/{recommendation/main_products.py => products_recommendation/main.py} (94%) rename examples/applications/{recommendation => products_recommendation}/prepare_dataset.py (97%) rename examples/applications/{ => products_recommendation}/test.ipynb (100%) delete mode 100644 examples/applications/recommendation/__pycache__/prepare_dataset.cpython-310.pyc diff --git a/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6947f000940c5e0a9189502c8ece11503498ecf GIT binary patch literal 4044 zcma)9&2uA16`$_;&}j5w*%rzgieLg%5LLFRP!zB+yMf(xQhay=36Pj9Vxn%#qm^g0 zJ>y+ljJSBKk{nzGl`99En=ko;`bKd8U*SekNq(*tXIX!zm)Q@;%PV-&yAZ+>Y+!9M^UJrm`L(wk!G->T>N^%6D9nX zgfGhY^+g~m_$>?ehSjc~VJ2&{OY;0HN`_e!_m(piMlm%+n1!Pk9){{+ z_*f29*bn<_;chre;*~TU$aIAaQkjifuZP*ra+a78P%Ljp=)O6sOw3oi>EluJzKVKL z91e`ta=H@^<;}LEgP}^qUN`GRLi^n?lf6V8=+a&)6{$RBylQ*e9jYkKbopV}>kZ`2 z-n+UnaiAlTC-EQ&h2}#{f6{PoC{c9? 
zp9T1mtm3QYp}Le=GIa5n%>E0F%<5B4kvC-5KD3Zor|%-CdyV9Y@}pmaeLc5DP1KLIm|G3$f0k*8`f{+K=?NK#Kb&&m_M<`?qs}s$!eag zvIVh#QHutRg!skHvu`1CMf;$=ZX#7zwwd;%Qi)36Y;*0Ud+%bWbt#I6gRm>ls3>7l zje2o{OhmDe`)7=lTszw;?CPotW0CB3(hSgosUuV}{Kp@fAHDjw(bL--r?5+-k<=w5 z(@aU3XfyQ{{3xomQS;vZ5OYXz3Ajzo;~}AwX2%>-3X@e;#uy!BNhjXhl`86Lk5-IB z?S*M}FqGOR6eGMC_(DVmak`=;0xJU0_CUt4*h!+U)UF9iyI84*!Gi=d?#0ogJ*mi5 zRe@-Gia>_o0BK!`lPHxPDx5^u;YNoP2OX6>Np-DgV^U0>>iJ@-&ZO`nnVNbiM1$sB z4EK7SXJAbwlRghoWgcs>23ufF-r|nk;7zv7tNe?s${nV@2a`)Mr_CwhF);rl8nZAL zd<PF$z06IfY%$k=!Dc zq@h%j#_L&QB#%;e$nRPYzVQwp`{VNaR$gAR#z7wBPVO&Rk1h3fUOFxk?|_wp9?n8y z&PTUr^c7K^>a$XA=iG2y#c&j@OMt0NSgx8^v0`mpGcluAREX`ss-?WPZav!1{Nplt z_MU~98)E*cpL-NpUJ?riV)eXkSgnAVz#g8NRQ&)g^<{`IG1d8pAV%O>#FMT20q#25CF{ULwEv5r%qmf*Ufs5aZ~3`#EmO0Yuh#~GN9uYOxQ z1g=x2ZezqOe%8Sc8IHT*tqt^~K7?m+zWj;z$!KK?uFjJv+cD_InJ2eb*TGNWAicFZ z9sijq2~j3W^3p_EIgj#q66FaiqB0&Kv}w6r1aivNPcdFQR1@mwXinW%n|S>ho|Ja= zc}v}-?$=0|(kAT&f?g)gS0O&*BW)x3`0pk}k22wn8E}XD(x1(K7mJyyyAP97rsqs? z&eWypG#nbN52Cc#O-Yl|`c3Qe?c>Er^)g1agPFZB)UyVy;U8^}BAiILy5NSQsOk+F zsdXAQr+!F^TO?i~@f`@=Fw8=UlS;-o57dvT?+%F{k)S;)w+(j91xca;_@cMg^ReMD3sX@ag`_98REi)nux$^nG4S3jp6NC z!unD{SOraug9&j}@=6{&w z&;fe!*P%8cr#i@~pOue;3v#N9xfyb57;>7+=Rg|s#rdbG#?`ANz7G*KUPS9khN6Sg z)J2Qzs4hxWyE$s3WzE!VQ64{`8LueK6nawxEo=1R3^H&{#{nqC=c18wSJdxm6;o_R z+`neg$D|KQ5C?3iUsCgb7QcFfOrDwf%?llYO#K?-Pt^Dg#QzKCnOd$~Rm-SgbJbIC z(X4NiFtu!mj}C~Upc_{ewA!NnYioF+sB~=b`F-y|{gy_&LxSp9Js>f?Q2dTwO#v@n zk)}R+Qv%Fv>9*{aUAOC#vh_{rRjU)wMHOa2$CE1v?NWTI;T literal 0 HcmV?d00001 diff --git a/examples/applications/recommendation/main_products.py b/examples/applications/products_recommendation/main.py similarity index 94% rename from examples/applications/recommendation/main_products.py rename to examples/applications/products_recommendation/main.py index 03d5b64..e045607 100644 --- a/examples/applications/recommendation/main_products.py +++ b/examples/applications/products_recommendation/main.py @@ -10,19 +10,19 @@ def default_args(**kwargs): parser = argparse.ArgumentParser() - parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--epochs", default=100, type=int) parser.add_argument("--number-of-clauses", default=1000, type=int) parser.add_argument("--T", default=10000, type=int) parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) - parser.add_argument("--depth", default=3, type=int) + parser.add_argument("--depth", default=1, type=int) parser.add_argument("--hypervector-size", default=4096, type=int) parser.add_argument("--hypervector-bits", default=256, type=int) parser.add_argument("--message-size", default=256, type=int) parser.add_argument("--message-bits", default=2, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--max-included-literals", default=10, type=int) + parser.add_argument("--max-included-literals", default=3, type=int) args = parser.parse_args() for key, value in kwargs.items(): @@ -30,9 +30,14 @@ def default_args(**kwargs): setattr(args, key, value) return args args = default_args() +np.random.seed(42) +# data = prepare_dataset.amazon_products() data = prepare_dataset.aug_amazon_products() -print(data.head()) +# data = prepare_dataset.artificial() +# data = prepare_dataset.artificial_with_user_pref() +# data = prepare_dataset.artificial_pattered() +# print(data.head()) le_user = LabelEncoder() le_item = LabelEncoder() le_category = LabelEncoder() 
diff --git a/examples/applications/recommendation/prepare_dataset.py b/examples/applications/products_recommendation/prepare_dataset.py similarity index 97% rename from examples/applications/recommendation/prepare_dataset.py rename to examples/applications/products_recommendation/prepare_dataset.py index 582b569..20162f0 100644 --- a/examples/applications/recommendation/prepare_dataset.py +++ b/examples/applications/products_recommendation/prepare_dataset.py @@ -2,7 +2,6 @@ import kagglehub import numpy as np -np.random.seed(42) def amazon_products(): print("Creating training data") @@ -14,6 +13,7 @@ def amazon_products(): return org_data[['product_id', 'category', 'user_id', 'rating']] def aug_amazon_products(): + np.random.seed(42) org_data = amazon_products() org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings @@ -24,7 +24,7 @@ def aug_amazon_products(): data = data.sample(frac=1, random_state=42).reset_index(drop=True) # Add noise # Define the noise ratio - noise_ratio = 0.1 # 10% noise + noise_ratio = 0.01 # 10% noise # Select rows to apply noise num_noisy_rows = int(noise_ratio * len(data)) noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False) @@ -35,10 +35,10 @@ def aug_amazon_products(): data.loc[noisy_indices, 'category'] = np.random.choice(unique_categories, size=num_noisy_rows) # Print a preview of the noisy and expanded dataset print("Expanded data shape:", data.shape) - print("Data preview:\n", data.head()) return data def artificial(): + np.random.seed(42) num_users = 5 # Number of unique users num_items =10 # Number of unique items num_categories = 5 # Number of unique categories @@ -59,6 +59,7 @@ def artificial(): return data def artificial_with_user_pref(): + np.random.seed(42) num_users = 100 # Number of unique users num_items = 50 # Number of unique items num_categories = 50 # Number of unique categories @@ -103,6 +104,7 @@ def artificial_with_user_pref(): return data def artificial_pattered(): + np.random.seed(42) num_users = 100 # Number of unique users num_items = 50 # Number of unique items num_categories = 5 # Number of unique categories diff --git a/examples/applications/test.ipynb b/examples/applications/products_recommendation/test.ipynb similarity index 100% rename from examples/applications/test.ipynb rename to examples/applications/products_recommendation/test.ipynb diff --git a/examples/applications/recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/applications/recommendation/__pycache__/prepare_dataset.cpython-310.pyc deleted file mode 100644 index 334918248b0038345364fc10337498871cad89c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1415 zcmZ8hOK&SR7_~i*q)D2lZBzsjC}Kqr_jbjmssQQ2T^1yekh&T9j(w+-PR4`n^bsYi zvds^GRm+ZFfcPukvg)1{E6z75QZKf~kI%>7^LWheZjYh;@uMy-Q^x+H&GRO(`3j%= z41;2dSL~d(vp5&+9G%BXsOV>Qo~T$Qm{Y~SXVdf%H`rND<9{u}g1%Sy+-nRT8t*t~ zp5F^49{8Fh(Td#(&Q?st@3A%a(S5w)I1@NaR$`X4r=nmg{f)oRV5@VsJbGod^krR4 zyp?553njf=etaqIrM%LWl`}azmUFo@^$RB}?OqVTX}>)BPWp?9H|-4wCT~j|pDg=N z@Rzx}T8_W9Wl`3$YEct+Asc-%jYF@oMlEuml`3>|>2+c3ZRjkVwj@ez>(w+3Nn^{} zhn-7V6qUYMya}Tx2Qp=@>&i$KL<1!VL}f4;lGeyH3ULDm!w$hS@-1|YEi&5v&g>rY z_|NIFY?@rM|IVX>TSfV3u%AD*IW%i2av(4<1 zA|BRNr`sR^C`1mQEh!DTEG7keQYh~t{^WYTlv9hKqe*J~opI^QvoGX_)bWqM#FV$ko z?T45wJ+kHao0|qYw0h3iGdfsI9Vy*Pe!$1#NW{@djQKb3mq@kWxOSYm;R= qq%?t7&8__e>unwh`OGpIu&w#-=^X!wN`FD6c0__Mo)@|p^!^25{fROF From 
fdfb81fb27cab73377e095984cdd5bede8002401 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Tue, 24 Dec 2024 11:30:06 +0000 Subject: [PATCH 17/22] add TMClassifier --- .../prepare_dataset.cpython-310.pyc | Bin 4044 -> 4044 bytes .../products_recommendation/baseline.py | 114 ++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 examples/applications/products_recommendation/baseline.py diff --git a/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc index d6947f000940c5e0a9189502c8ece11503498ecf..7742db56bf6d6b0f6731787e01a26b4ff39baead 100644 GIT binary patch delta 19 ZcmX>je@31wpO=@50SKatH*y`~2LLh>1kL~e delta 19 ZcmX>je@31wpO=@50SF@UH*y`~2LLhb1jhgX diff --git a/examples/applications/products_recommendation/baseline.py b/examples/applications/products_recommendation/baseline.py new file mode 100644 index 0000000..f1d3727 --- /dev/null +++ b/examples/applications/products_recommendation/baseline.py @@ -0,0 +1,114 @@ +import logging +import argparse +import numpy as np +from tmu.models.classification.vanilla_classifier import TMClassifier +from tmu.tools import BenchmarkTimer +from tmu.util.cuda_profiler import CudaProfiler +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +import prepare_dataset +from tmu.data import MNIST +from sklearn.preprocessing import OneHotEncoder +import pandas as pd + +_LOGGER = logging.getLogger(__name__) + +def metrics(args): + return dict( + accuracy=[], + train_time=[], + test_time=[], + args=vars(args) + ) + +def prepare_data(): + # Step 1: Load and encode dataset + data = prepare_dataset.aug_amazon_products() + le_user = LabelEncoder() + le_item = LabelEncoder() + le_category = LabelEncoder() + le_rating = LabelEncoder() + data['user_id'] = le_user.fit_transform(data['user_id']) + data['product_id'] = le_item.fit_transform(data['product_id']) + data['category'] = le_category.fit_transform(data['category']) + data['rating'] = le_rating.fit_transform(data['rating']) + + x = data[['user_id', 'product_id', 'category']].values + y = data['rating'].values + # Step 3: One-hot encode features + encoder = OneHotEncoder(sparse_output=False, dtype=np.uint32) + x_binary = encoder.fit_transform(x) + + # Verify feature dimensions + print(f"Number of features after one-hot encoding: {x_binary.shape[1]}") + + x_train, x_test, y_train, y_test = train_test_split(x_binary, y, test_size=0.2, random_state=42) + + y_train = y_train.astype(np.uint32) + y_test = y_test.astype(np.uint32) + + print("x_train shape:", x_train.shape, "dtype:", x_train.dtype) + print("y_train shape:", y_train.shape, "dtype:", y_train.dtype) + print("x_test shape:", x_test.shape, "dtype:", x_test.dtype) + print("y_test shape:", y_test.shape, "dtype:", y_test.dtype) + + return x_train, x_test, y_train, y_test + +def main(args): + experiment_results = metrics(args) + X_train, X_test, Y_train, Y_test = prepare_data() + + tm = TMClassifier( + number_of_clauses=args.num_clauses, + T=args.T, + s=args.s, + max_included_literals=args.max_included_literals, + platform=args.platform, + weighted_clauses=args.weighted_clauses + ) + _LOGGER.info(f"Running {TMClassifier} for {args.epochs}") + for epoch in range(args.epochs): + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with benchmark_total: + benchmark1 = BenchmarkTimer(logger=None, text="Training Time") + with benchmark1: + res = tm.fit( + X_train, + 
Y_train, + ) + + experiment_results["train_time"].append(benchmark1.elapsed()) + benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") + with benchmark2: + result = 100 * (tm.predict(X_test) == Y_test).mean() + experiment_results["accuracy"].append(result) + experiment_results["test_time"].append(benchmark2.elapsed()) + + _LOGGER.info(f"Epoch: {epoch + 1}, Accuracy: {result:.2f}, Training Time: {benchmark1.elapsed():.2f}s, " + f"Testing Time: {benchmark2.elapsed():.2f}s") + + if args.platform == "CUDA": + CudaProfiler().print_timings(benchmark=benchmark_total) + + return experiment_results + + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + parser.add_argument("--num_clauses", default=2000, type=int) + parser.add_argument("--T", default=5000, type=int) + parser.add_argument("--s", default=10.0, type=float) + parser.add_argument("--max_included_literals", default=32, type=int) + parser.add_argument("--platform", default="CPU_sparse", type=str, choices=["CPU", "CPU_sparse", "CUDA"]) + parser.add_argument("--weighted_clauses", default=True, type=bool) + parser.add_argument("--epochs", default=60, type=int) + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args + + +if __name__ == "__main__": + results = main(default_args()) + _LOGGER.info(results) \ No newline at end of file From 3168dc7c889fafd0adada91d73b99dd983d64a5b Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Tue, 24 Dec 2024 17:12:57 +0000 Subject: [PATCH 18/22] update --- .../prepare_dataset.cpython-310.pyc | Bin 4044 -> 5371 bytes .../products_recommendation/baseline.py | 49 ++------------- .../products_recommendation/main.py | 59 ++++++------------ .../prepare_dataset.py | 44 +++++++++++-- 4 files changed, 64 insertions(+), 88 deletions(-) diff --git a/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc index 7742db56bf6d6b0f6731787e01a26b4ff39baead..a91b323f497e840523e521b9b11aa792ba20e8ba 100644 GIT binary patch literal 5371 zcma)A-ESOM6`y-&c4qd|>yK;_3dIykS*kcy5eTIwDKs$!iu(z)re&Gz9j|Azp4r?P zr}i?7P#vl5gC!6G3B<#uZ{dOegMZ<^A&|f?yzo{b%I}<6+w0VU?rP4QIrn4kJs-b& z&Ls7E#lr99zx8|3XDsWl^f3Fg@$f9J^c@goDKW4%1mER`J)W;rp z+x~{nyaT?xQ3hX9?p2=!UyQ@&lWZ#J z($J$HZH%txaHVS?88){sgq4YjRPvq}lf=%geJQNmQqFm6EHZoI5}a z(YKM?TZaja6_#VoxawV#!Yu_LfyuT;^83ZhKB?O6h~1nPGF8G zR^jd;=@gxvO&xSiO$V_`wmWGC|3X(s>14QeAI)hr3XKOt@=T=sw$P8^rimXd-Pj#s zXsAv(?UIr4kUWzn#T;R%*rK|IF{Y9wop@(E)KS;C^rAR0Zjfes!_e5|!Pr#{-@-=N z+e}r5*iW%;V-La@J+_jl8=4aD#gy=-9tIB*%(xRrw|7EarZtjhAu_6Q5~X2>A|=uD zv<8QK+cdC@4z=xdbnIIKUF?nj{i>W%(po_wkW1%36G|ghL)9XCo(lmeiX^@(5 zMN2frf>@F**|e9$vaHEZiJEkTzJd;2T*p6JuZN5M`sZL~{aVQ}e0dCy9ZTgD{x-2Q zhkTiQ>z*w9jbs2f!4mc=mH+}-;_{r-2cF9kFDp&lyp+4S2fy_G-~a~w+=rj}>$-(i z98#?5M~M4}4GzbNbqL0~?`T@C=63Q0_Q6p^Qiu<^^+q4+f&Y=`Z# zDU<#Q$kKw1vT!>sta74r)1+STix8g5fCGPI%u9_wML}>h7I>JN%E?guHF_`M*Nn|7 zfzG1Ms2c@?FQFw}2B`^W`d^X@aseiDq!h*9|F`4n=h5kux#l3C{gW!vrx8qNn#+D) zz+6(5#`e2nQz%z?cm0gcA90<4!(pjEmBLIUeZQE?Df;CS#w|=q5;7%8+@3Xt@yK-h z^1Aib<8R8bKQ6y#<>iakxRO_LC-*N}cP#xz?oA5AJAk4}52qkOLl|j8;`ubNiSVPE=pDZtrIPq)aP&*TSCZ;P-Qv zc9wf;fnBVTH{fE6Qx{Xv;UT-oDmZ-(;h-B{T|-MsKT}cr(L+YvI=wx2I`kf6~r_dS;%mq4r81K`UbU;#r1U}FA{kb#MEbZw^tw;A0+zg)NDs;_XLPw zo7ZV1I~8l-;zM1AOq0?08Y=9IlAL-HsaY5GX7WfO05o|SaF>u|) z@KeCmgy0ysS`Py*2f$SVaJf_9@|1rDaFwUh8Q?-HO+a9^IIEdH2|PUrSdSEdRlwA^ zG6k+`Ud=1Fq<)#eN#QgCP(8zZrQm14sm8%ia7y4*0|?asPW8Ob;M8C^Kr1Se`V^cR 
z0H=Odo>Y#(iK>$maB4C*&E<0djrjtc^mm{y{aqr@fJ9Ar*4YSELNmt&9h)4x9>s2s z*l1ZZF)No|y3rB#Y?F$JO$ zRBEsU1rbZA_^?@GPbD@a4ojS=#DN6pu1Zt^_mP26wagr+i<@+=y6b1Afc68@wykX} z*VH$otb^h?PB#<1ZQMJ-U?)tGF_Kb^HwgK#WQGvcR3Vw389-G$6j`4UyGHEI_#PXf zj+!COG}XzwoxR^+#vh-*1+EL?mQc)C7>;DfMd#9d9b+qH-$We10%_oo0G;M0O_R%p35)k7}r#5ae zhw3a3?b*MTO|7v3pwYE{m^djqQx; zXgtxnCd_V^H3cWb7HUvrkzaXQZ;^!5qq$^1E=l!>M);f~ zAEX70R?uWhD7%Jx#fHwOw+3OLTyPG+8)Jh+Gbb@3l#oIaP3#HH+ zxZ>qHO5dbM!Z(h?LP4~enGNxw!h!9t;+;E#@G4T3qOSJwv52xB`unm{_Rst0{7e3m Ie!b@Z7rInPIHVq@1toDl+x+w=mn^r}&EZBpw$AR5#zPz(Mh2w9Ck z5|Zz8&O&XEwOOdwj24q(l1WYK9RYiZwOGxRqQa#i&5q#I2jfvHwDx&xH=)qPbc+!s zRb)i7I-<>6yk<+vTG|~^%aAwI;cYInZ4p{65oTL_j~CeWu5LNCtL8`Z!sD=(}zqB&#nY971<9I(c`?Mm6(@^%{aa#JyRoJXuf@p_j1Efcir8vvuuq zvMkmsDWvZjL(BA6AU(ahtky#kSS-98>)U~^x?G({oC9H^_4F4KrmG(op)1r|G#KmE z(BD#T59YyICAdm0K*`fgy$0uHK$0E2C0gwWJVz<(Vsy(sHa95s9UAEJIFH!nO)vB- z4OR7nCz*X`Us*j%6xSZgERpaeBBIJz+o?8V(`z>Ux>R#;rvXa@ZeA_JJ_lF|%?}3mg=0x9TZu9cvT=p!V!q%t~;1b|8 zLDhMYEbLD)x}Te!8B}@&l&95;e#*VO{0T+|L`d=sk$m^z_W+Q#q^`kyqN%l@uA~1{ zm>Yc`j=y@J;7b@cqlY8&yNR-y!;~dG>DW|!^#Rx)0up7DV>OW1c}89^*8zu0OapPD zsr);>Qof9q^Y-ip$R7b9z4{o?mtNgM*NN`t*F*dQt{t0AI?i5ju*WI##YTO5tNGbM v70C^83X?7Z$^c9p(j`$SxeTnOExG~j`R6z>0=O)?#Oc+nv^8msXXpO~)Wa`t diff --git a/examples/applications/products_recommendation/baseline.py b/examples/applications/products_recommendation/baseline.py index f1d3727..b390764 100644 --- a/examples/applications/products_recommendation/baseline.py +++ b/examples/applications/products_recommendation/baseline.py @@ -1,15 +1,9 @@ import logging import argparse -import numpy as np from tmu.models.classification.vanilla_classifier import TMClassifier from tmu.tools import BenchmarkTimer from tmu.util.cuda_profiler import CudaProfiler -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder import prepare_dataset -from tmu.data import MNIST -from sklearn.preprocessing import OneHotEncoder -import pandas as pd _LOGGER = logging.getLogger(__name__) @@ -21,42 +15,11 @@ def metrics(args): args=vars(args) ) -def prepare_data(): - # Step 1: Load and encode dataset - data = prepare_dataset.aug_amazon_products() - le_user = LabelEncoder() - le_item = LabelEncoder() - le_category = LabelEncoder() - le_rating = LabelEncoder() - data['user_id'] = le_user.fit_transform(data['user_id']) - data['product_id'] = le_item.fit_transform(data['product_id']) - data['category'] = le_category.fit_transform(data['category']) - data['rating'] = le_rating.fit_transform(data['rating']) - - x = data[['user_id', 'product_id', 'category']].values - y = data['rating'].values - # Step 3: One-hot encode features - encoder = OneHotEncoder(sparse_output=False, dtype=np.uint32) - x_binary = encoder.fit_transform(x) - - # Verify feature dimensions - print(f"Number of features after one-hot encoding: {x_binary.shape[1]}") - - x_train, x_test, y_train, y_test = train_test_split(x_binary, y, test_size=0.2, random_state=42) - - y_train = y_train.astype(np.uint32) - y_test = y_test.astype(np.uint32) - - print("x_train shape:", x_train.shape, "dtype:", x_train.dtype) - print("y_train shape:", y_train.shape, "dtype:", y_train.dtype) - print("x_test shape:", x_test.shape, "dtype:", x_test.dtype) - print("y_test shape:", y_test.shape, "dtype:", y_test.dtype) - - return x_train, x_test, y_train, y_test - def main(args): experiment_results = metrics(args) - X_train, X_test, Y_train, Y_test = prepare_data() + data = prepare_dataset.aug_amazon_products() + x, y = prepare_dataset.construct_x_y(data) + X_train, X_test, 
Y_train, Y_test = prepare_dataset.one_hot_encoding(x,y) tm = TMClassifier( number_of_clauses=args.num_clauses, @@ -92,23 +55,21 @@ def main(args): return experiment_results - def default_args(**kwargs): parser = argparse.ArgumentParser() parser.add_argument("--num_clauses", default=2000, type=int) - parser.add_argument("--T", default=5000, type=int) + parser.add_argument("--T", default=10000, type=int) parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--max_included_literals", default=32, type=int) parser.add_argument("--platform", default="CPU_sparse", type=str, choices=["CPU", "CPU_sparse", "CUDA"]) parser.add_argument("--weighted_clauses", default=True, type=bool) - parser.add_argument("--epochs", default=60, type=int) + parser.add_argument("--epochs", default=10, type=int) args = parser.parse_args() for key, value in kwargs.items(): if key in args.__dict__: setattr(args, key, value) return args - if __name__ == "__main__": results = main(default_args()) _LOGGER.info(results) \ No newline at end of file diff --git a/examples/applications/products_recommendation/main.py b/examples/applications/products_recommendation/main.py index e045607..41168a9 100644 --- a/examples/applications/products_recommendation/main.py +++ b/examples/applications/products_recommendation/main.py @@ -2,16 +2,13 @@ from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine from time import time import argparse -import pandas as pd import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder import prepare_dataset def default_args(**kwargs): parser = argparse.ArgumentParser() - parser.add_argument("--epochs", default=100, type=int) - parser.add_argument("--number-of-clauses", default=1000, type=int) + parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--number-of-clauses", default=2000, type=int) parser.add_argument("--T", default=10000, type=int) parser.add_argument("--s", default=10.0, type=float) parser.add_argument("--number-of-state-bits", default=8, type=int) @@ -22,7 +19,7 @@ def default_args(**kwargs): parser.add_argument("--message-bits", default=2, type=int) parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) - parser.add_argument("--max-included-literals", default=3, type=int) + parser.add_argument("--max-included-literals", default=23, type=int) args = parser.parse_args() for key, value in kwargs.items(): @@ -38,21 +35,8 @@ def default_args(**kwargs): # data = prepare_dataset.artificial_with_user_pref() # data = prepare_dataset.artificial_pattered() # print(data.head()) -le_user = LabelEncoder() -le_item = LabelEncoder() -le_category = LabelEncoder() -le_rating = LabelEncoder() -data['user_id'] = le_user.fit_transform(data['user_id']) -data['product_id'] = le_item.fit_transform(data['product_id']) -data['category'] = le_category.fit_transform(data['category']) -data['rating'] = le_rating.fit_transform(data['rating']) -x = data[['user_id', 'product_id', 'category']].values -y = data['rating'].values -X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42) -print("X_train shape:", X_train.shape) -print("y_train shape:", Y_train.shape) -print("X_test shape:", X_test.shape) -print("y_test shape:", Y_test.shape) +x, y = prepare_dataset.construct_x_y(data) +X_train, X_test, Y_train, Y_test = prepare_dataset.train_test_split(x,y) users = 
data['user_id'].unique() items = data['product_id'].unique() categories = data['category'].unique() @@ -160,24 +144,21 @@ def default_args(**kwargs): # weights = tm.get_state()[1].reshape(2, -1) # for i in range(tm.number_of_clauses): -# print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') -# l = [] -# for k in range(args.hypervector_size * 2): -# if tm.ta_action(0, i, k): -# if k < args.hypervector_size: -# l.append("x%d" % (k)) -# else: -# l.append("NOT x%d" % (k - args.hypervector_size)) - -# for k in range(args.message_size * 2): -# if tm.ta_action(1, i, k): -# if k < args.message_size: -# l.append("c%d" % (k)) -# else: -# l.append("NOT c%d" % (k - args.message_size)) - -# print(" AND ".join(l)) - +# print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') +# l = [] +# for k in range(args.hypervector_size * 2): +# if tm.ta_action(0, i, k): +# if k < args.hypervector_size: +# l.append("x%d" % (k)) +# else: +# l.append("NOT x%d" % (k - args.hypervector_size)) +# for k in range(args.message_size * 2): +# if tm.ta_action(1, i, k): +# if k < args.message_size: +# l.append("c%d" % (k)) +# else: +# l.append("NOT c%d" % (k - args.message_size)) +# print(" AND ".join(l)) # print(graphs_test.hypervectors) # print(tm.hypervectors) # print(graphs_test.edge_type_id) \ No newline at end of file diff --git a/examples/applications/products_recommendation/prepare_dataset.py b/examples/applications/products_recommendation/prepare_dataset.py index 20162f0..dfe1b50 100644 --- a/examples/applications/products_recommendation/prepare_dataset.py +++ b/examples/applications/products_recommendation/prepare_dataset.py @@ -1,7 +1,9 @@ import pandas as pd import kagglehub import numpy as np - +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import OneHotEncoder def amazon_products(): print("Creating training data") @@ -12,7 +14,7 @@ def amazon_products(): print("Original data shape:", org_data.shape) return org_data[['product_id', 'category', 'user_id', 'rating']] -def aug_amazon_products(): +def aug_amazon_products(noise_ratio = 0.01): np.random.seed(42) org_data = amazon_products() org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN @@ -23,8 +25,6 @@ def aug_amazon_products(): # Shuffle the expanded dataset data = data.sample(frac=1, random_state=42).reset_index(drop=True) # Add noise - # Define the noise ratio - noise_ratio = 0.01 # 10% noise # Select rows to apply noise num_noisy_rows = int(noise_ratio * len(data)) noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False) @@ -144,4 +144,38 @@ def artificial_pattered(): 'category': [item_categories[item] for item in item_ids], 'rating': ratings }) - return data \ No newline at end of file + return data + +def construct_x_y(data): + le_user = LabelEncoder() + le_item = LabelEncoder() + le_category = LabelEncoder() + le_rating = LabelEncoder() + data['user_id'] = le_user.fit_transform(data['user_id']) + data['product_id'] = le_item.fit_transform(data['product_id']) + data['category'] = le_category.fit_transform(data['category']) + data['rating'] = le_rating.fit_transform(data['rating']) + x = data[['user_id', 'product_id', 'category']].values + y = data['rating'].values + return x,y + +def split_train_test(x,y): + X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42) + print("X_train shape:", X_train.shape) + print("y_train shape:", 
Y_train.shape) + print("X_test shape:", X_test.shape) + print("y_test shape:", Y_test.shape) + return X_train, X_test, Y_train, Y_test + +def one_hot_encoding(x,y): + encoder = OneHotEncoder(sparse_output=False, dtype=np.uint32) + x_binary = encoder.fit_transform(x) + # print(f"Number of features after one-hot encoding: {x_binary.shape[1]}") + x_train, x_test, y_train, y_test = split_train_test(x_binary, y) + y_train = y_train.astype(np.uint32) + y_test = y_test.astype(np.uint32) + print("x_train shape:", x_train.shape, "dtype:", x_train.dtype) + print("y_train shape:", y_train.shape, "dtype:", y_train.dtype) + print("x_test shape:", x_test.shape, "dtype:", x_test.dtype) + print("y_test shape:", y_test.shape, "dtype:", y_test.dtype) + return x_train, x_test, y_train, y_test \ No newline at end of file From e2232de44dfe505f19dfe0dffd298571cc485df3 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Tue, 24 Dec 2024 18:54:16 +0000 Subject: [PATCH 19/22] add graph nn --- .../prepare_dataset.cpython-310.pyc | Bin 5371 -> 5371 bytes .../products_recommendation/graph_nn.py | 106 ++++++++++++++++++ .../{main.py => graph_tm.py} | 0 .../{baseline.py => tm_classifier.py} | 0 4 files changed, 106 insertions(+) create mode 100644 examples/applications/products_recommendation/graph_nn.py rename examples/applications/products_recommendation/{main.py => graph_tm.py} (100%) rename examples/applications/products_recommendation/{baseline.py => tm_classifier.py} (100%) diff --git a/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc index a91b323f497e840523e521b9b11aa792ba20e8ba..8a2cd18398677e0c9bd2a85d805bb970ed3cb003 100644 GIT binary patch delta 19 ZcmeyZ`CF4KpO=@50SLNYZ{+$Y0suYN1{(kX delta 19 ZcmeyZ`CF4KpO=@50SE%0Y~=bV0suT-1=|1s diff --git a/examples/applications/products_recommendation/graph_nn.py b/examples/applications/products_recommendation/graph_nn.py new file mode 100644 index 0000000..fa78480 --- /dev/null +++ b/examples/applications/products_recommendation/graph_nn.py @@ -0,0 +1,106 @@ +import torch +import torch.nn.functional as F +from torch_geometric.data import Data +from torch_geometric.nn import GCNConv +from time import time +import prepare_dataset + +# Step 1: Dataset Preparation + +data = prepare_dataset.aug_amazon_products() +x, y = prepare_dataset.construct_x_y(data) +X_train, X_test, Y_train, Y_test = prepare_dataset.train_test_split(x,y) + +# Graph Construction +num_users = len(data['user_id'].unique()) +num_items = len(data['product_id'].unique()) +num_categories = len(data['category'].unique()) +num_nodes = num_users + num_items + num_categories + +# Build edge list +edge_list = [] + +# User ↔ Item edges +for user, item in zip(X_train[:, 0], X_train[:, 1]): + edge_list.append((user, num_users + item)) # User to Item + edge_list.append((num_users + item, user)) # Item to User + +# Item ↔ Category edges +for item, category in zip(X_train[:, 1], X_train[:, 2]): + edge_list.append((num_users + item, num_users + num_items + category)) # Item to Category + edge_list.append((num_users + num_items + category, num_users + item)) # Category to Item + +# Create edge index for PyTorch Geometric +edge_index = torch.tensor(edge_list, dtype=torch.long).t() + +# Node features +node_features = torch.rand((num_nodes, 64), dtype=torch.float) + +# PyTorch Geometric Data object +graph_data = Data(x=node_features, edge_index=edge_index) + +# Step 2: Define GCN Model +class 
GCN(torch.nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim): + super(GCN, self).__init__() + self.conv1 = GCNConv(input_dim, hidden_dim) + self.conv2 = GCNConv(hidden_dim, output_dim) + + def forward(self, x, edge_index): + x = self.conv1(x, edge_index) + x = F.relu(x) + x = self.conv2(x, edge_index) + return x + +# Initialize Model +model = GCN(input_dim=64, hidden_dim=128, output_dim=64) + +# Define optimizer +optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + +# Convert train/test data to tensors +train_edges = torch.tensor( + [(user, num_users + item) for user, item in zip(X_train[:, 0], X_train[:, 1])], + dtype=torch.long +).t() +train_labels = torch.tensor(Y_train, dtype=torch.float) + +test_edges = torch.tensor( + [(user, num_users + item) for user, item in zip(X_test[:, 0], X_test[:, 1])], + dtype=torch.long +).t() +test_labels = torch.tensor(Y_test, dtype=torch.float) + +# Training Loop with Accuracy Logging +epochs = 1000 +for epoch in range(epochs): + start_time = time() + + # Training Phase + model.train() + optimizer.zero_grad() + out = model(graph_data.x, graph_data.edge_index) + + # User-item embeddings + user_embeddings = out[train_edges[0]] + item_embeddings = out[train_edges[1]] + predicted_ratings = (user_embeddings * item_embeddings).sum(dim=1) + + # Compute loss + loss = F.mse_loss(predicted_ratings, train_labels) + loss.backward() + optimizer.step() + + # Testing Phase + model.eval() + with torch.no_grad(): + out = model(graph_data.x, graph_data.edge_index) + test_user_embeddings = out[test_edges[0]] + test_item_embeddings = out[test_edges[1]] + test_predicted_ratings = (test_user_embeddings * test_item_embeddings).sum(dim=1) + + # Compute accuracy + test_accuracy = ((test_predicted_ratings.round() == test_labels).float().mean().item()) * 100 + + elapsed_time = time() - start_time + print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {test_accuracy:.2f}%, Time: {elapsed_time:.2f}s") diff --git a/examples/applications/products_recommendation/main.py b/examples/applications/products_recommendation/graph_tm.py similarity index 100% rename from examples/applications/products_recommendation/main.py rename to examples/applications/products_recommendation/graph_tm.py diff --git a/examples/applications/products_recommendation/baseline.py b/examples/applications/products_recommendation/tm_classifier.py similarity index 100% rename from examples/applications/products_recommendation/baseline.py rename to examples/applications/products_recommendation/tm_classifier.py From c4546310371c7dbcb0ccad959223c087c0a6669c Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 25 Dec 2024 13:04:24 +0000 Subject: [PATCH 20/22] add main.bash --- .../experiment_results.csv | 6 + .../products_recommendation/graph_nn.py | 226 +++++++------- .../products_recommendation/graph_tm.py | 277 +++++++++--------- .../products_recommendation/main.sh | 16 + .../products_recommendation/test.ipynb | 271 ----------------- .../products_recommendation/tm_classifier.py | 73 +++-- 6 files changed, 323 insertions(+), 546 deletions(-) create mode 100644 examples/applications/products_recommendation/experiment_results.csv create mode 100644 examples/applications/products_recommendation/main.sh delete mode 100644 examples/applications/products_recommendation/test.ipynb diff --git a/examples/applications/products_recommendation/experiment_results.csv b/examples/applications/products_recommendation/experiment_results.csv new file mode 100644 index 0000000..d3f66d2 --- 
/dev/null +++ b/examples/applications/products_recommendation/experiment_results.csv @@ -0,0 +1,6 @@ +Algorithm,Noise_Ratio,T,s,Max_Included_Literals,Epochs,Platform,Total_Time,Accuracy +Graph NN,0.005,0,0,0,1000,CPU,0.03006434440612793,76.72131061553955 +GraphTM,0.005,10000,10.0,23,10,CUDA,34.547648191452026,98.46994535519126 +TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,89.6943154335022,76.63934426229508 +Graph NN,0.01,0,0,0,1000,CPU,0.01817464828491211,75.95628499984741 +GraphTM,0.01,10000,10.0,23,10,CUDA,34.95576763153076,98.44262295081967 diff --git a/examples/applications/products_recommendation/graph_nn.py b/examples/applications/products_recommendation/graph_nn.py index fa78480..30292db 100644 --- a/examples/applications/products_recommendation/graph_nn.py +++ b/examples/applications/products_recommendation/graph_nn.py @@ -1,106 +1,130 @@ +import argparse import torch import torch.nn.functional as F from torch_geometric.data import Data from torch_geometric.nn import GCNConv -from time import time import prepare_dataset - -# Step 1: Dataset Preparation - -data = prepare_dataset.aug_amazon_products() -x, y = prepare_dataset.construct_x_y(data) -X_train, X_test, Y_train, Y_test = prepare_dataset.train_test_split(x,y) - -# Graph Construction -num_users = len(data['user_id'].unique()) -num_items = len(data['product_id'].unique()) -num_categories = len(data['category'].unique()) -num_nodes = num_users + num_items + num_categories - -# Build edge list -edge_list = [] - -# User ↔ Item edges -for user, item in zip(X_train[:, 0], X_train[:, 1]): - edge_list.append((user, num_users + item)) # User to Item - edge_list.append((num_users + item, user)) # Item to User - -# Item ↔ Category edges -for item, category in zip(X_train[:, 1], X_train[:, 2]): - edge_list.append((num_users + item, num_users + num_items + category)) # Item to Category - edge_list.append((num_users + num_items + category, num_users + item)) # Category to Item - -# Create edge index for PyTorch Geometric -edge_index = torch.tensor(edge_list, dtype=torch.long).t() - -# Node features -node_features = torch.rand((num_nodes, 64), dtype=torch.float) - -# PyTorch Geometric Data object -graph_data = Data(x=node_features, edge_index=edge_index) - -# Step 2: Define GCN Model -class GCN(torch.nn.Module): - def __init__(self, input_dim, hidden_dim, output_dim): - super(GCN, self).__init__() - self.conv1 = GCNConv(input_dim, hidden_dim) - self.conv2 = GCNConv(hidden_dim, output_dim) - - def forward(self, x, edge_index): - x = self.conv1(x, edge_index) - x = F.relu(x) - x = self.conv2(x, edge_index) - return x - -# Initialize Model -model = GCN(input_dim=64, hidden_dim=128, output_dim=64) - -# Define optimizer -optimizer = torch.optim.Adam(model.parameters(), lr=0.01) - -# Convert train/test data to tensors -train_edges = torch.tensor( - [(user, num_users + item) for user, item in zip(X_train[:, 0], X_train[:, 1])], - dtype=torch.long -).t() -train_labels = torch.tensor(Y_train, dtype=torch.float) - -test_edges = torch.tensor( - [(user, num_users + item) for user, item in zip(X_test[:, 0], X_test[:, 1])], - dtype=torch.long -).t() -test_labels = torch.tensor(Y_test, dtype=torch.float) - -# Training Loop with Accuracy Logging -epochs = 1000 -for epoch in range(epochs): - start_time = time() - - # Training Phase - model.train() - optimizer.zero_grad() - out = model(graph_data.x, graph_data.edge_index) - - # User-item embeddings - user_embeddings = out[train_edges[0]] - item_embeddings = out[train_edges[1]] - predicted_ratings = 
(user_embeddings * item_embeddings).sum(dim=1) - - # Compute loss - loss = F.mse_loss(predicted_ratings, train_labels) - loss.backward() - optimizer.step() - - # Testing Phase - model.eval() - with torch.no_grad(): - out = model(graph_data.x, graph_data.edge_index) - test_user_embeddings = out[test_edges[0]] - test_item_embeddings = out[test_edges[1]] - test_predicted_ratings = (test_user_embeddings * test_item_embeddings).sum(dim=1) - - # Compute accuracy - test_accuracy = ((test_predicted_ratings.round() == test_labels).float().mean().item()) * 100 - - elapsed_time = time() - start_time - print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {test_accuracy:.2f}%, Time: {elapsed_time:.2f}s") +from tmu.tools import BenchmarkTimer +import os +import pandas as pd + +def main(args): + results = [] + data = prepare_dataset.aug_amazon_products(noise_ratio = args.dataset_noise_ratio) + x, y = prepare_dataset.construct_x_y(data) + X_train, X_test, Y_train, Y_test = prepare_dataset.train_test_split(x,y) + # Graph Construction + num_users = len(data['user_id'].unique()) + num_items = len(data['product_id'].unique()) + num_categories = len(data['category'].unique()) + num_nodes = num_users + num_items + num_categories + # Build edge list + edge_list = [] + # User ↔ Item edges + for user, item in zip(X_train[:, 0], X_train[:, 1]): + edge_list.append((user, num_users + item)) # User to Item + edge_list.append((num_users + item, user)) # Item to User + # Item ↔ Category edges + for item, category in zip(X_train[:, 1], X_train[:, 2]): + edge_list.append((num_users + item, num_users + num_items + category)) # Item to Category + edge_list.append((num_users + num_items + category, num_users + item)) # Category to Item + # Create edge index for PyTorch Geometric + edge_index = torch.tensor(edge_list, dtype=torch.long).t() + # Node features + node_features = torch.rand((num_nodes, 64), dtype=torch.float) + # PyTorch Geometric Data object + graph_data = Data(x=node_features, edge_index=edge_index) + # Step 2: Define GCN Model + class GCN(torch.nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim): + super(GCN, self).__init__() + self.conv1 = GCNConv(input_dim, hidden_dim) + self.conv2 = GCNConv(hidden_dim, output_dim) + def forward(self, x, edge_index): + x = self.conv1(x, edge_index) + x = F.relu(x) + x = self.conv2(x, edge_index) + return x + # Initialize Model + model = GCN(input_dim=64, hidden_dim=128, output_dim=64) + # Define optimizer + optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + # Convert train/test data to tensors + train_edges = torch.tensor( + [(user, num_users + item) for user, item in zip(X_train[:, 0], X_train[:, 1])], + dtype=torch.long + ).t() + train_labels = torch.tensor(Y_train, dtype=torch.float) + test_edges = torch.tensor( + [(user, num_users + item) for user, item in zip(X_test[:, 0], X_test[:, 1])], + dtype=torch.long + ).t() + test_labels = torch.tensor(Y_test, dtype=torch.float) + # Training Loop with Accuracy Logging + for epoch in range(args.epochs): + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with benchmark_total: + benchmark1 = BenchmarkTimer(logger=None, text="Training Time") + with benchmark1: + # Training Phase + model.train() + optimizer.zero_grad() + out = model(graph_data.x, graph_data.edge_index) + # User-item embeddings + user_embeddings = out[train_edges[0]] + item_embeddings = out[train_edges[1]] + predicted_ratings = (user_embeddings * item_embeddings).sum(dim=1) + # Compute loss + loss = 
F.mse_loss(predicted_ratings, train_labels) + loss.backward() + optimizer.step() + train_time = benchmark1.elapsed() + # Testing Phase + benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") + with benchmark2: + model.eval() + with torch.no_grad(): + out = model(graph_data.x, graph_data.edge_index) + test_user_embeddings = out[test_edges[0]] + test_item_embeddings = out[test_edges[1]] + test_predicted_ratings = (test_user_embeddings * test_item_embeddings).sum(dim=1) + # Compute accuracy + accuracy = ((test_predicted_ratings.round() == test_labels).float().mean().item()) * 100 + test_time = benchmark2.elapsed() + total_time = benchmark_total.elapsed() + # Append results for each epoch + results.append({ + "Algorithm": "Graph NN", + "Noise_Ratio": args.dataset_noise_ratio, + "T": 0, + "s": 0, + "Max_Included_Literals": 0, + "Epochs": args.epochs, + "Platform": args.platform, + "Total_Time": total_time, + "Accuracy": accuracy, + }) + + # Save results to CSV + results_df = pd.DataFrame(results) + results_file = "experiment_results.csv" + if os.path.exists(results_file): + results_df.to_csv(results_file, mode='a', index=False, header=False) + else: + results_df.to_csv(results_file, index=False) + print(f"Results saved to {results_file}") + + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + parser.add_argument("--platform", default="CPU", type=str, choices=["CPU", "CUDA"]) + parser.add_argument("--epochs", default=1000, type=int) + parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args + +if __name__ == "__main__": + main(default_args()) \ No newline at end of file diff --git a/examples/applications/products_recommendation/graph_tm.py b/examples/applications/products_recommendation/graph_tm.py index 41168a9..0ec2171 100644 --- a/examples/applications/products_recommendation/graph_tm.py +++ b/examples/applications/products_recommendation/graph_tm.py @@ -1,9 +1,145 @@ from GraphTsetlinMachine.graphs import Graphs from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine -from time import time import argparse import numpy as np import prepare_dataset +import pandas as pd +from tmu.tools import BenchmarkTimer +import os + +def main(args): + np.random.seed(42) + results = [] + data = prepare_dataset.aug_amazon_products(noise_ratio = args.dataset_noise_ratio) + x, y = prepare_dataset.construct_x_y(data) + X_train, X_test, Y_train, Y_test = prepare_dataset.train_test_split(x,y) + users = data['user_id'].unique() + items = data['product_id'].unique() + categories = data['category'].unique() + # Initialize Graphs with symbols for GTM + number_of_nodes = 3 + symbols = [] + symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] + print("Symbols: ",len(symbols)) + + # Train data + graphs_train = Graphs( + X_train.shape[0], + symbols=symbols, + hypervector_size=args.hypervector_size, + hypervector_bits=args.hypervector_bits, + double_hashing = args.double_hashing + ) + for graph_id in range(X_train.shape[0]): + graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) + graphs_train.prepare_node_configuration() + for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + 
graphs_train.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_train.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_train.add_graph_node(graph_id, "Category", number_of_edges) + graphs_train.prepare_edge_configuration() + for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_train.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_train.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_train.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_train.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") + + graphs_train.add_graph_node_property(graph_id, "User", "U_" + str(X_train[graph_id][0])) + graphs_train.add_graph_node_property(graph_id, "Item", "I_" + str(X_train[graph_id][1])) + graphs_train.add_graph_node_property(graph_id, "Category", "C_" + str(X_train[graph_id][2])) + graphs_train.encode() + print("Training data produced") + + # Test data + graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) + for graph_id in range(X_test.shape[0]): + graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) + graphs_test.prepare_node_configuration() + for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + graphs_test.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_test.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_test.add_graph_node(graph_id, "Category", number_of_edges) + graphs_test.prepare_edge_configuration() + for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_test.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_test.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_test.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_test.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") + + graphs_test.add_graph_node_property(graph_id, "User", "U_" + str(X_test[graph_id][0])) + graphs_test.add_graph_node_property(graph_id, "Item", "I_" + str(X_test[graph_id][1])) + graphs_test.add_graph_node_property(graph_id, "Category", "C_" + str(X_test[graph_id][2])) + graphs_test.encode() + print("Testing data produced") + + tm = MultiClassGraphTsetlinMachine( + args.number_of_clauses, + args.T, + args.s, + number_of_state_bits = args.number_of_state_bits, + depth=args.depth, + message_size=args.message_size, + message_bits=args.message_bits, + max_included_literals=args.max_included_literals, + double_hashing = args.double_hashing + ) + + for epoch in range(args.epochs): + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with benchmark_total: + benchmark1 = BenchmarkTimer(logger=None, text="Training Time") + with benchmark1: + tm.fit(graphs_train, Y_train, epochs=1, incremental=True) + train_time = benchmark1.elapsed() + + benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") + with benchmark2: + accuracy = 100*(tm.predict(graphs_test) == Y_test).mean() + test_time = benchmark2.elapsed() + total_time = benchmark_total.elapsed() + # result_train = 100*(tm.predict(graphs_train) == Y_train).mean() + 
results.append({ + "Algorithm": "GraphTM", + "Noise_Ratio": args.dataset_noise_ratio, + "T": args.T, + "s": args.s, + "Max_Included_Literals": args.max_included_literals, + "Epochs": args.epochs, + "Platform": "CUDA", + "Total_Time": total_time, + "Accuracy": accuracy, + }) + + # Save results to CSV + results_df = pd.DataFrame(results) + results_file = "experiment_results.csv" + if os.path.exists(results_file): + results_df.to_csv(results_file, mode='a', index=False, header=False) + else: + results_df.to_csv(results_file, index=False) + print(f"Results saved to {results_file}") def default_args(**kwargs): parser = argparse.ArgumentParser() @@ -20,145 +156,12 @@ def default_args(**kwargs): parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') parser.add_argument("--noise", default=0.01, type=float) parser.add_argument("--max-included-literals", default=23, type=int) - + parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) args = parser.parse_args() for key, value in kwargs.items(): if key in args.__dict__: setattr(args, key, value) return args -args = default_args() -np.random.seed(42) - -# data = prepare_dataset.amazon_products() -data = prepare_dataset.aug_amazon_products() -# data = prepare_dataset.artificial() -# data = prepare_dataset.artificial_with_user_pref() -# data = prepare_dataset.artificial_pattered() -# print(data.head()) -x, y = prepare_dataset.construct_x_y(data) -X_train, X_test, Y_train, Y_test = prepare_dataset.train_test_split(x,y) -users = data['user_id'].unique() -items = data['product_id'].unique() -categories = data['category'].unique() -# Initialize Graphs with symbols for GTM -number_of_nodes = 3 -symbols = [] -symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] -print("Symbols: ",len(symbols)) - -# Train data -graphs_train = Graphs( - X_train.shape[0], - symbols=symbols, - hypervector_size=args.hypervector_size, - hypervector_bits=args.hypervector_bits, - double_hashing = args.double_hashing -) -for graph_id in range(X_train.shape[0]): - graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) -graphs_train.prepare_node_configuration() -for graph_id in range(X_train.shape[0]): - for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): - number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 - if node_id == 0: - graphs_train.add_graph_node(graph_id, "User", number_of_edges) - elif node_id == 1: - graphs_train.add_graph_node(graph_id, "Item", number_of_edges) - else: - graphs_train.add_graph_node(graph_id, "Category", number_of_edges) -graphs_train.prepare_edge_configuration() -for graph_id in range(X_train.shape[0]): - for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): - if node_id == 0: - graphs_train.add_graph_node_edge(graph_id, "User", "Item", "UserItem") - - if node_id == 1: - graphs_train.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") - graphs_train.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") - - if node_id == 2: - graphs_train.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") - - graphs_train.add_graph_node_property(graph_id, "User", "U_" + str(X_train[graph_id][0])) - graphs_train.add_graph_node_property(graph_id, "Item", "I_" + str(X_train[graph_id][1])) - graphs_train.add_graph_node_property(graph_id, "Category", "C_" + str(X_train[graph_id][2])) -graphs_train.encode() -print("Training data produced") - 
-# Test data -graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) -for graph_id in range(X_test.shape[0]): - graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) -graphs_test.prepare_node_configuration() -for graph_id in range(X_test.shape[0]): - for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): - number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 - if node_id == 0: - graphs_test.add_graph_node(graph_id, "User", number_of_edges) - elif node_id == 1: - graphs_test.add_graph_node(graph_id, "Item", number_of_edges) - else: - graphs_test.add_graph_node(graph_id, "Category", number_of_edges) -graphs_test.prepare_edge_configuration() -for graph_id in range(X_test.shape[0]): - for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): - if node_id == 0: - graphs_test.add_graph_node_edge(graph_id, "User", "Item", "UserItem") - - if node_id == 1: - graphs_test.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") - graphs_test.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") - - if node_id == 2: - graphs_test.add_graph_node_edge(graph_id, "Category", "Item", "CatrgoryItem") - - graphs_test.add_graph_node_property(graph_id, "User", "U_" + str(X_test[graph_id][0])) - graphs_test.add_graph_node_property(graph_id, "Item", "I_" + str(X_test[graph_id][1])) - graphs_test.add_graph_node_property(graph_id, "Category", "C_" + str(X_test[graph_id][2])) -graphs_test.encode() -print("Testing data produced") - -tm = MultiClassGraphTsetlinMachine( - args.number_of_clauses, - args.T, - args.s, - number_of_state_bits = args.number_of_state_bits, - depth=args.depth, - message_size=args.message_size, - message_bits=args.message_bits, - max_included_literals=args.max_included_literals, - double_hashing = args.double_hashing -) - -for i in range(args.epochs): - start_training = time() - tm.fit(graphs_train, Y_train, epochs=1, incremental=True) - stop_training = time() - - start_testing = time() - result_test = 100*(tm.predict(graphs_test) == Y_test).mean() - stop_testing = time() - - result_train = 100*(tm.predict(graphs_train) == Y_train).mean() - print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) -# weights = tm.get_state()[1].reshape(2, -1) -# for i in range(tm.number_of_clauses): -# print("Clause #%d W:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') -# l = [] -# for k in range(args.hypervector_size * 2): -# if tm.ta_action(0, i, k): -# if k < args.hypervector_size: -# l.append("x%d" % (k)) -# else: -# l.append("NOT x%d" % (k - args.hypervector_size)) -# for k in range(args.message_size * 2): -# if tm.ta_action(1, i, k): -# if k < args.message_size: -# l.append("c%d" % (k)) -# else: -# l.append("NOT c%d" % (k - args.message_size)) -# print(" AND ".join(l)) -# print(graphs_test.hypervectors) -# print(tm.hypervectors) -# print(graphs_test.edge_type_id) \ No newline at end of file +if __name__ == "__main__": + main(default_args()) \ No newline at end of file diff --git a/examples/applications/products_recommendation/main.sh b/examples/applications/products_recommendation/main.sh new file mode 100644 index 0000000..8c7a22a --- /dev/null +++ b/examples/applications/products_recommendation/main.sh @@ -0,0 +1,16 @@ +echo `date`, Setup the environment ... 
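+# Sweep the dataset noise ratio and run the three recommenders on each setting.
+# Each script appends its summary row(s) to experiment_results.csv in this directory
+# (the Python scripts open the file in append mode), so repeated runs accumulate results.
+# The $models list below names the scripts; the loop body calls them explicitly, but
+# the same runs could be driven from $models, e.g.:
+#   for M in $models; do python3 ${M}.py --dataset_noise_ratio $N; done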
+set -e # exit if error + +models="graph_tm tm_classifier graph_nn" +dataset_noise_ratios="0.005 0.01 0.02 0.05 0.1 0.2" + +for N in $dataset_noise_ratios; do + echo `date`, Running Graph NN ... + python3 graph_nn.py --dataset_noise_ratio $N + + echo `date`, Running Graph Tsetlin Machine ... + python3 graph_tm.py --dataset_noise_ratio $N + + echo `date`, Running Tsetlin Machine Classifier ... + python3 tm_classifier.py --dataset_noise_ratio $N +done \ No newline at end of file diff --git a/examples/applications/products_recommendation/test.ipynb b/examples/applications/products_recommendation/test.ipynb deleted file mode 100644 index 1465bf1..0000000 --- a/examples/applications/products_recommendation/test.ipynb +++ /dev/null @@ -1,271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "usage: ipykernel_launcher.py [-h] [--epochs EPOCHS]\n", - " [--number-of-clauses NUMBER_OF_CLAUSES] [--T T]\n", - " [--s S]\n", - " [--number-of-state-bits NUMBER_OF_STATE_BITS]\n", - " [--depth DEPTH]\n", - " [--hypervector-size HYPERVECTOR_SIZE]\n", - " [--hypervector-bits HYPERVECTOR_BITS]\n", - " [--message-size MESSAGE_SIZE]\n", - " [--message-bits MESSAGE_BITS] [--double-hashing]\n", - " [--noise NOISE]\n", - " [--max-included-literals MAX_INCLUDED_LITERALS]\n", - "ipykernel_launcher.py: error: unrecognized arguments: --f=/root/.local/share/jupyter/runtime/kernel-v306f6e67794e909fd94dbef768cafee2e613728cc.json\n" - ] - }, - { - "ename": "SystemExit", - "evalue": "2", - "output_type": "error", - "traceback": [ - "An exception has occurred, use %tb to see the full traceback.\n", - "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/root/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3585: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", - " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n" - ] - } - ], - "source": [ - "from GraphTsetlinMachine.graphs import Graphs\n", - "from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine\n", - "from time import time\n", - "import argparse\n", - "import pandas as pd\n", - "import numpy as np\n", - "import kagglehub\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "def default_args(**kwargs):\n", - " parser = argparse.ArgumentParser()\n", - " parser.add_argument(\"--epochs\", default=250, type=int)\n", - " parser.add_argument(\"--number-of-clauses\", default=10000, type=int)\n", - " parser.add_argument(\"--T\", default=10000, type=int)\n", - " parser.add_argument(\"--s\", default=10.0, type=float)\n", - " parser.add_argument(\"--number-of-state-bits\", default=8, type=int)\n", - " parser.add_argument(\"--depth\", default=1, type=int)\n", - " parser.add_argument(\"--hypervector-size\", default=4096, type=int)\n", - " parser.add_argument(\"--hypervector-bits\", default=256, type=int)\n", - " parser.add_argument(\"--message-size\", default=4096, type=int)\n", - " parser.add_argument(\"--message-bits\", default=256, type=int)\n", - " parser.add_argument('--double-hashing', 
dest='double_hashing', default=False, action='store_true')\n", - " parser.add_argument(\"--noise\", default=0.01, type=float)\n", - " parser.add_argument(\"--max-included-literals\", default=10, type=int)\n", - "\n", - " args = parser.parse_args()\n", - " for key, value in kwargs.items():\n", - " if key in args.__dict__:\n", - " setattr(args, key, value)\n", - " return args\n", - "\n", - "args = default_args()\n", - "\n", - "############################# real dataset ########################\n", - "\n", - "print(\"Creating training data\")\n", - "path = kagglehub.dataset_download(\"karkavelrajaj/amazon-sales-dataset\")\n", - "print(\"Path to dataset files:\", path)\n", - "data_file = path + \"/amazon.csv\" \n", - "org_data = pd.read_csv(data_file)\n", - "# print(\"Data preview:\", data.head())\n", - "org_data = org_data[['product_id', 'category', 'user_id', 'rating']]\n", - "#################################### expanded \n", - "org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN\n", - "org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings\n", - "org_data['rating'] = org_data['rating'].astype(int)\n", - "# Expand the dataset 10 times\n", - "data = pd.concat([org_data] * 10, ignore_index=True)\n", - "\n", - "# Shuffle the expanded dataset\n", - "data = data.sample(frac=1, random_state=42).reset_index(drop=True)\n", - "\n", - "# Add noise\n", - "# Define the noise ratio\n", - "noise_ratio = 0.1 # 10% noise\n", - "\n", - "# Select rows to apply noise\n", - "num_noisy_rows = int(noise_ratio * len(data))\n", - "noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False)\n", - "\n", - "# Add noise to ratings\n", - "data.loc[noisy_indices, 'rating'] = np.random.choice(range(1, 6), size=num_noisy_rows)\n", - "\n", - "# Add noise to categories\n", - "unique_categories = data['category'].unique()\n", - "data.loc[noisy_indices, 'category'] = np.random.choice(unique_categories, size=num_noisy_rows)\n", - "\n", - "# Print a preview of the noisy and expanded dataset\n", - "print(\"Original data shape:\", org_data.shape)\n", - "print(\"Expanded data shape:\", data.shape)\n", - "print(\"Data preview:\\n\", data.head())\n", - "\n", - "print(data.head())\n", - " \n", - "le_user = LabelEncoder()\n", - "le_item = LabelEncoder()\n", - "le_category = LabelEncoder()\n", - "le_rating = LabelEncoder() \n", - "\n", - "data['user_id'] = le_user.fit_transform(data['user_id'])\n", - "data['product_id'] = le_item.fit_transform(data['product_id'])\n", - "data['category'] = le_category.fit_transform(data['category'])\n", - "data['rating'] = le_rating.fit_transform(data['rating'])\n", - "\n", - "x = data[['user_id', 'product_id', 'category']].values \n", - "y = data['rating'].values \n", - "\n", - "X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)\n", - "\n", - "print(\"X_train shape:\", X_train.shape)\n", - "print(\"y_train shape:\", Y_train.shape)\n", - "print(\"X_test shape:\", X_test.shape)\n", - "print(\"y_test shape:\", Y_test.shape)\n", - "\n", - "users = data['user_id'].unique()\n", - "items = data['product_id'].unique()\n", - "categories = data['category'].unique()\n", - "\n", - "# Initialize Graphs with symbols for GTM\n", - "number_of_nodes = 3\n", - "symbols = []\n", - "symbols = [\"U_\" + str(u) for u in users] + [\"I_\" + str(i) for i in items] + [\"C_\" + str(c) for c in categories] \n", - "print(len(symbols))\n", - "# Train data\n", - "graphs_train = Graphs(\n", - " 
X_train.shape[0],\n", - " symbols=symbols,\n", - " hypervector_size=args.hypervector_size,\n", - " hypervector_bits=args.hypervector_bits,\n", - " double_hashing = args.double_hashing\n", - ")\n", - "for graph_id in range(X_train.shape[0]):\n", - " graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes)\n", - "graphs_train.prepare_node_configuration()\n", - "for graph_id in range(X_train.shape[0]):\n", - " for node_id in range(graphs_train.number_of_graph_nodes[graph_id]):\n", - " number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1\n", - " if node_id == 0:\n", - " graphs_train.add_graph_node(graph_id, \"User\", number_of_edges)\n", - " elif node_id == 1:\n", - " graphs_train.add_graph_node(graph_id, \"Item\", number_of_edges)\n", - " else:\n", - " graphs_train.add_graph_node(graph_id, \"Category\", number_of_edges)\n", - "graphs_train.prepare_edge_configuration()\n", - "for graph_id in range(X_train.shape[0]):\n", - " for node_id in range(graphs_train.number_of_graph_nodes[graph_id]):\n", - " if node_id == 0:\n", - " graphs_train.add_graph_node_edge(graph_id, \"User\", \"Item\", \"UserItem\")\n", - " \n", - " if node_id == 1:\n", - " graphs_train.add_graph_node_edge(graph_id, \"Item\", \"Category\", \"ItemCategory\")\n", - " graphs_train.add_graph_node_edge(graph_id, \"Item\", \"User\", \"ItemUser\")\n", - " \n", - " if node_id == 2:\n", - " graphs_train.add_graph_node_edge(graph_id, \"Category\", \"Item\", \"CatrgoryItem\")\n", - "\n", - " graphs_train.add_graph_node_property(graph_id, \"User\", \"U_\" + str(X_train[graph_id][0]))\n", - " graphs_train.add_graph_node_property(graph_id, \"Item\", \"I_\" + str(X_train[graph_id][1]))\n", - " graphs_train.add_graph_node_property(graph_id, \"Category\", \"C_\" + str(X_train[graph_id][2]))\n", - "graphs_train.encode()\n", - "print(\"Training data produced\")\n", - "\n", - "# Test data\n", - "graphs_test = Graphs(X_test.shape[0], init_with=graphs_train)\n", - "for graph_id in range(X_test.shape[0]):\n", - " graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes)\n", - "graphs_test.prepare_node_configuration()\n", - "for graph_id in range(X_test.shape[0]):\n", - " for node_id in range(graphs_test.number_of_graph_nodes[graph_id]):\n", - " number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1\n", - " if node_id == 0:\n", - " graphs_test.add_graph_node(graph_id, \"User\", number_of_edges)\n", - " elif node_id == 1:\n", - " graphs_test.add_graph_node(graph_id, \"Item\", number_of_edges)\n", - " else:\n", - " graphs_test.add_graph_node(graph_id, \"Category\", number_of_edges)\n", - "graphs_test.prepare_edge_configuration()\n", - "for graph_id in range(X_test.shape[0]):\n", - " for node_id in range(graphs_test.number_of_graph_nodes[graph_id]):\n", - " if node_id == 0:\n", - " graphs_test.add_graph_node_edge(graph_id, \"User\", \"Item\", \"UserItem\")\n", - " \n", - " if node_id == 1:\n", - " graphs_test.add_graph_node_edge(graph_id, \"Item\", \"Category\", \"ItemCategory\")\n", - " graphs_test.add_graph_node_edge(graph_id, \"Item\", \"User\", \"ItemUser\")\n", - " \n", - " if node_id == 2:\n", - " graphs_test.add_graph_node_edge(graph_id, \"Category\", \"Item\", \"CatrgoryItem\")\n", - "\n", - " graphs_test.add_graph_node_property(graph_id, \"User\", \"U_\" + str(X_test[graph_id][0]))\n", - " graphs_test.add_graph_node_property(graph_id, \"Item\", \"I_\" + str(X_test[graph_id][1]))\n", - " graphs_test.add_graph_node_property(graph_id, 
\"Category\", \"C_\" + str(X_test[graph_id][2]))\n", - "graphs_test.encode()\n", - "print(\"Testing data produced\")\n", - "\n", - "tm = MultiClassGraphTsetlinMachine(\n", - " args.number_of_clauses,\n", - " args.T,\n", - " args.s,\n", - " number_of_state_bits = args.number_of_state_bits,\n", - " depth=args.depth,\n", - " message_size=args.message_size,\n", - " message_bits=args.message_bits,\n", - " max_included_literals=args.max_included_literals,\n", - " double_hashing = args.double_hashing\n", - ")\n", - "\n", - "for i in range(args.epochs):\n", - " start_training = time()\n", - " tm.fit(graphs_train, Y_train, epochs=1, incremental=True)\n", - " stop_training = time()\n", - "\n", - " start_testing = time()\n", - " result_test = 100*(tm.predict(graphs_test) == Y_test).mean()\n", - " stop_testing = time()\n", - "\n", - " result_train = 100*(tm.predict(graphs_train) == Y_train).mean()\n", - "\n", - " print(\"%d %.2f %.2f %.2f %.2f\" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/applications/products_recommendation/tm_classifier.py b/examples/applications/products_recommendation/tm_classifier.py index b390764..1a2928d 100644 --- a/examples/applications/products_recommendation/tm_classifier.py +++ b/examples/applications/products_recommendation/tm_classifier.py @@ -1,60 +1,59 @@ -import logging import argparse from tmu.models.classification.vanilla_classifier import TMClassifier from tmu.tools import BenchmarkTimer -from tmu.util.cuda_profiler import CudaProfiler import prepare_dataset +import pandas as pd +import os -_LOGGER = logging.getLogger(__name__) - -def metrics(args): - return dict( - accuracy=[], - train_time=[], - test_time=[], - args=vars(args) - ) - -def main(args): - experiment_results = metrics(args) - data = prepare_dataset.aug_amazon_products() +def main(args): + results = [] + data = prepare_dataset.aug_amazon_products(noise_ratio = args.dataset_noise_ratio) x, y = prepare_dataset.construct_x_y(data) X_train, X_test, Y_train, Y_test = prepare_dataset.one_hot_encoding(x,y) - tm = TMClassifier( number_of_clauses=args.num_clauses, T=args.T, s=args.s, max_included_literals=args.max_included_literals, platform=args.platform, - weighted_clauses=args.weighted_clauses + weighted_clauses=args.weighted_clauses, ) - _LOGGER.info(f"Running {TMClassifier} for {args.epochs}") + for epoch in range(args.epochs): benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") with benchmark_total: benchmark1 = BenchmarkTimer(logger=None, text="Training Time") with benchmark1: - res = tm.fit( - X_train, - Y_train, - ) - - experiment_results["train_time"].append(benchmark1.elapsed()) + tm.fit(X_train, Y_train) + train_time = benchmark1.elapsed() benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") with benchmark2: - result = 100 * (tm.predict(X_test) == Y_test).mean() - experiment_results["accuracy"].append(result) - experiment_results["test_time"].append(benchmark2.elapsed()) - - _LOGGER.info(f"Epoch: {epoch + 1}, Accuracy: {result:.2f}, Training Time: {benchmark1.elapsed():.2f}s, " - 
f"Testing Time: {benchmark2.elapsed():.2f}s") - - if args.platform == "CUDA": - CudaProfiler().print_timings(benchmark=benchmark_total) - - return experiment_results + accuracy = 100 * (tm.predict(X_test) == Y_test).mean() + test_time = benchmark2.elapsed() + total_time = benchmark_total.elapsed() + + # Append results for each epoch + results.append({ + "Algorithm": "TMClassifier", + "Noise_Ratio": args.dataset_noise_ratio, + "T": args.T, + "s": args.s, + "Max_Included_Literals": args.max_included_literals, + "Epochs": args.epochs, + "Platform": args.platform, + "Total_Time": total_time, + "Accuracy": accuracy, + }) + # Save results to CSV + results_df = pd.DataFrame(results) + results_file = "experiment_results.csv" + if os.path.exists(results_file): + results_df.to_csv(results_file, mode='a', index=False, header=False) + else: + results_df.to_csv(results_file, index=False) + print(f"Results saved to {results_file}") + def default_args(**kwargs): parser = argparse.ArgumentParser() parser.add_argument("--num_clauses", default=2000, type=int) @@ -64,6 +63,7 @@ def default_args(**kwargs): parser.add_argument("--platform", default="CPU_sparse", type=str, choices=["CPU", "CPU_sparse", "CUDA"]) parser.add_argument("--weighted_clauses", default=True, type=bool) parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) args = parser.parse_args() for key, value in kwargs.items(): if key in args.__dict__: @@ -71,5 +71,4 @@ def default_args(**kwargs): return args if __name__ == "__main__": - results = main(default_args()) - _LOGGER.info(results) \ No newline at end of file + main(default_args()) \ No newline at end of file From 84d8012259f3a253f102b6321f508a1da474743d Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Wed, 25 Dec 2024 15:40:00 +0000 Subject: [PATCH 21/22] add results --- .../experiment_results.csv | 6 ------ examples/recomm_system/README.md | 2 ++ .../prepare_dataset.cpython-310.pyc | Bin examples/recomm_system/experiment_results.csv | 19 ++++++++++++++++++ .../graph_nn.py | 0 .../graph_tm.py | 0 .../main.sh | 0 .../prepare_dataset.py | 0 .../tm_classifier.py | 0 9 files changed, 21 insertions(+), 6 deletions(-) delete mode 100644 examples/applications/products_recommendation/experiment_results.csv create mode 100644 examples/recomm_system/README.md rename examples/{applications/products_recommendation => recomm_system}/__pycache__/prepare_dataset.cpython-310.pyc (100%) create mode 100644 examples/recomm_system/experiment_results.csv rename examples/{applications/products_recommendation => recomm_system}/graph_nn.py (100%) rename examples/{applications/products_recommendation => recomm_system}/graph_tm.py (100%) rename examples/{applications/products_recommendation => recomm_system}/main.sh (100%) rename examples/{applications/products_recommendation => recomm_system}/prepare_dataset.py (100%) rename examples/{applications/products_recommendation => recomm_system}/tm_classifier.py (100%) diff --git a/examples/applications/products_recommendation/experiment_results.csv b/examples/applications/products_recommendation/experiment_results.csv deleted file mode 100644 index d3f66d2..0000000 --- a/examples/applications/products_recommendation/experiment_results.csv +++ /dev/null @@ -1,6 +0,0 @@ -Algorithm,Noise_Ratio,T,s,Max_Included_Literals,Epochs,Platform,Total_Time,Accuracy -Graph NN,0.005,0,0,0,1000,CPU,0.03006434440612793,76.72131061553955 -GraphTM,0.005,10000,10.0,23,10,CUDA,34.547648191452026,98.46994535519126 
-TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,89.6943154335022,76.63934426229508 -Graph NN,0.01,0,0,0,1000,CPU,0.01817464828491211,75.95628499984741 -GraphTM,0.01,10000,10.0,23,10,CUDA,34.95576763153076,98.44262295081967 diff --git a/examples/recomm_system/README.md b/examples/recomm_system/README.md new file mode 100644 index 0000000..e7fa211 --- /dev/null +++ b/examples/recomm_system/README.md @@ -0,0 +1,2 @@ +cd examples/recomm_system/ +bash main.sh \ No newline at end of file diff --git a/examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc b/examples/recomm_system/__pycache__/prepare_dataset.cpython-310.pyc similarity index 100% rename from examples/applications/products_recommendation/__pycache__/prepare_dataset.cpython-310.pyc rename to examples/recomm_system/__pycache__/prepare_dataset.cpython-310.pyc diff --git a/examples/recomm_system/experiment_results.csv b/examples/recomm_system/experiment_results.csv new file mode 100644 index 0000000..cb6e80f --- /dev/null +++ b/examples/recomm_system/experiment_results.csv @@ -0,0 +1,19 @@ +Algorithm,Noise_Ratio,T,s,Max_Included_Literals,Epochs,Platform,Total_Time,Accuracy +Graph NN,0.005,0,0,0,1000,CPU,0.03006434440612793,76.72131061553955 +GraphTM,0.005,10000,10.0,23,10,CUDA,34.547648191452026,98.46994535519126 +TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,89.6943154335022,76.63934426229508 +Graph NN,0.01,0,0,0,1000,CPU,0.01817464828491211,75.95628499984741 +GraphTM,0.01,10000,10.0,23,10,CUDA,34.95576763153076,98.44262295081967 +TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,96.10501098632812,74.93169398907104 +Graph NN,0.02,0,0,0,1000,CPU,0.03073263168334961,81.22950792312622 +GraphTM,0.02,10000,10.0,23,10,CUDA,36.0724892616272,97.43169398907104 +TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,95.67133641242981,72.40437158469946 +Graph NN,0.05,0,0,0,1000,CPU,0.014258623123168945,83.52459073066711 +GraphTM,0.05,10000,10.0,23,10,CUDA,38.86628317832947,95.0 +TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,96.7427487373352,64.65163934426229 +Graph NN,0.1,0,0,0,1000,CPU,0.022305965423583984,73.33333492279053 +GraphTM,0.1,10000,10.0,23,10,CUDA,37.45086216926575,90.08196721311475 +TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,90.45554423332214,49.8292349726776 +Graph NN,0.2,0,0,0,1000,CPU,0.03204679489135742,59.863388538360596 +GraphTM,0.2,10000,10.0,23,10,CUDA,16.268279790878296,78.77049180327869 +TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,96.16712856292725,20.184426229508194 diff --git a/examples/applications/products_recommendation/graph_nn.py b/examples/recomm_system/graph_nn.py similarity index 100% rename from examples/applications/products_recommendation/graph_nn.py rename to examples/recomm_system/graph_nn.py diff --git a/examples/applications/products_recommendation/graph_tm.py b/examples/recomm_system/graph_tm.py similarity index 100% rename from examples/applications/products_recommendation/graph_tm.py rename to examples/recomm_system/graph_tm.py diff --git a/examples/applications/products_recommendation/main.sh b/examples/recomm_system/main.sh similarity index 100% rename from examples/applications/products_recommendation/main.sh rename to examples/recomm_system/main.sh diff --git a/examples/applications/products_recommendation/prepare_dataset.py b/examples/recomm_system/prepare_dataset.py similarity index 100% rename from examples/applications/products_recommendation/prepare_dataset.py rename to examples/recomm_system/prepare_dataset.py diff --git 
a/examples/applications/products_recommendation/tm_classifier.py b/examples/recomm_system/tm_classifier.py similarity index 100% rename from examples/applications/products_recommendation/tm_classifier.py rename to examples/recomm_system/tm_classifier.py From d68ae7153845b1ed2f09ebf2b2726a9e21444b99 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid Date: Thu, 26 Dec 2024 16:01:53 +0000 Subject: [PATCH 22/22] fair comparisons --- examples/recomm_system/experiment_results.csv | 36 +++++++++++++++++++ examples/recomm_system/graph_nn.py | 12 +++---- examples/recomm_system/graph_tm.py | 10 +++--- examples/recomm_system/tm_classifier.py | 10 +++--- 4 files changed, 52 insertions(+), 16 deletions(-) diff --git a/examples/recomm_system/experiment_results.csv b/examples/recomm_system/experiment_results.csv index cb6e80f..957f770 100644 --- a/examples/recomm_system/experiment_results.csv +++ b/examples/recomm_system/experiment_results.csv @@ -17,3 +17,39 @@ TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,90.45554423332214,49.8292349726776 Graph NN,0.2,0,0,0,1000,CPU,0.03204679489135742,59.863388538360596 GraphTM,0.2,10000,10.0,23,10,CUDA,16.268279790878296,78.77049180327869 TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,96.16712856292725,20.184426229508194 +Graph NN,0.005,0,0,0,1000,CPU,0.0168764591217041,76.85792446136475 +GraphTM,0.005,10000,10.0,23,10,CUDA,31.40691065788269,98.82513661202185 +TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,88.05298614501953,76.74180327868852 +Graph NN,0.01,0,0,0,1000,CPU,0.01720118522644043,87.4316930770874 +GraphTM,0.01,10000,10.0,23,10,CUDA,31.529547214508057,98.4153005464481 +TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,89.19472336769104,74.93169398907104 +Graph NN,0.02,0,0,0,1000,CPU,0.014032602310180664,78.36065292358398 +GraphTM,0.02,10000,10.0,23,10,CUDA,32.8007595539093,97.62295081967213 +TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,94.56675243377686,72.6775956284153 +Graph NN,0.05,0,0,0,1000,CPU,0.016784191131591797,76.88524723052979 +GraphTM,0.05,10000,10.0,23,10,CUDA,34.84256434440613,94.75409836065573 +TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,96.4975814819336,64.1051912568306 +Graph NN,0.1,0,0,0,1000,CPU,0.014883041381835938,70.54644823074341 +GraphTM,0.1,10000,10.0,23,10,CUDA,36.750433683395386,89.97267759562841 +TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,96.35110449790955,50.341530054644814 +Graph NN,0.2,0,0,0,1000,CPU,0.03427433967590332,61.50273084640503 +GraphTM,0.2,10000,10.0,23,10,CUDA,39.63756251335144,79.01639344262294 +TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,97.00698733329773,20.116120218579233 +Graph NN,0.005,0,0,0,20000,CPU,370.7295939922333,87.5683069229126 +GraphTM,0.005,10000,10.0,23,10,CUDA,342.7878243923187,98.82513661202185 +TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,954.4101324081421,76.63934426229508 +Graph NN,0.01,0,0,0,20000,CPU,304.6031119823456,86.74863576889038 +GraphTM,0.01,10000,10.0,23,10,CUDA,346.8704605102539,98.25136612021858 +TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,978.3629264831543,74.93169398907104 +Graph NN,0.02,0,0,0,20000,CPU,403.2585175037384,75.30054450035095 +GraphTM,0.02,10000,10.0,23,10,CUDA,353.39254236221313,97.65027322404372 +TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,971.3300836086273,72.1311475409836 +Graph NN,0.05,0,0,0,20000,CPU,398.8085067272186,93.8524603843689 +GraphTM,0.05,10000,10.0,23,10,CUDA,368.16111874580383,94.59016393442623 +TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,960.4506890773773,63.661202185792355 +Graph NN,0.1,0,0,0,20000,CPU,388.4886665344238,75.43715834617615 
+GraphTM,0.1,10000,10.0,23,10,CUDA,340.63327074050903,90.43715846994536 +TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,972.1077370643616,49.35109289617486 +Graph NN,0.2,0,0,0,20000,CPU,438.5506749153137,64.04371857643127 +GraphTM,0.2,10000,10.0,23,10,CUDA,357.2651107311249,77.89617486338798 +TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,948.7157049179077,20.116120218579233 diff --git a/examples/recomm_system/graph_nn.py b/examples/recomm_system/graph_nn.py index 30292db..9ef5fbe 100644 --- a/examples/recomm_system/graph_nn.py +++ b/examples/recomm_system/graph_nn.py @@ -61,9 +61,9 @@ def forward(self, x, edge_index): ).t() test_labels = torch.tensor(Y_test, dtype=torch.float) # Training Loop with Accuracy Logging - for epoch in range(args.epochs): - benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") - with benchmark_total: + benchmark_total = BenchmarkTimer(logger=None, text="Epochs Time") + with benchmark_total: + for epoch in range(args.epochs): benchmark1 = BenchmarkTimer(logger=None, text="Training Time") with benchmark1: # Training Phase @@ -91,8 +91,8 @@ def forward(self, x, edge_index): # Compute accuracy accuracy = ((test_predicted_ratings.round() == test_labels).float().mean().item()) * 100 test_time = benchmark2.elapsed() - total_time = benchmark_total.elapsed() - # Append results for each epoch + total_time = benchmark_total.elapsed() + # Append results for each epoch results.append({ "Algorithm": "Graph NN", "Noise_Ratio": args.dataset_noise_ratio, @@ -118,7 +118,7 @@ def forward(self, x, edge_index): def default_args(**kwargs): parser = argparse.ArgumentParser() parser.add_argument("--platform", default="CPU", type=str, choices=["CPU", "CUDA"]) - parser.add_argument("--epochs", default=1000, type=int) + parser.add_argument("--epochs", default=20000, type=int) parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) args = parser.parse_args() for key, value in kwargs.items(): diff --git a/examples/recomm_system/graph_tm.py b/examples/recomm_system/graph_tm.py index 0ec2171..d1464c7 100644 --- a/examples/recomm_system/graph_tm.py +++ b/examples/recomm_system/graph_tm.py @@ -106,9 +106,9 @@ def main(args): double_hashing = args.double_hashing ) - for epoch in range(args.epochs): - benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") - with benchmark_total: + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with benchmark_total: + for epoch in range(args.epochs): benchmark1 = BenchmarkTimer(logger=None, text="Training Time") with benchmark1: tm.fit(graphs_train, Y_train, epochs=1, incremental=True) @@ -118,8 +118,8 @@ def main(args): with benchmark2: accuracy = 100*(tm.predict(graphs_test) == Y_test).mean() test_time = benchmark2.elapsed() - total_time = benchmark_total.elapsed() - # result_train = 100*(tm.predict(graphs_train) == Y_train).mean() + total_time = benchmark_total.elapsed() + # result_train = 100*(tm.predict(graphs_train) == Y_train).mean() results.append({ "Algorithm": "GraphTM", "Noise_Ratio": args.dataset_noise_ratio, diff --git a/examples/recomm_system/tm_classifier.py b/examples/recomm_system/tm_classifier.py index 1a2928d..876f8c4 100644 --- a/examples/recomm_system/tm_classifier.py +++ b/examples/recomm_system/tm_classifier.py @@ -19,9 +19,9 @@ def main(args): weighted_clauses=args.weighted_clauses, ) - for epoch in range(args.epochs): - benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") - with benchmark_total: + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with 
benchmark_total: + for epoch in range(args.epochs): benchmark1 = BenchmarkTimer(logger=None, text="Training Time") with benchmark1: tm.fit(X_train, Y_train) @@ -30,9 +30,9 @@ def main(args): with benchmark2: accuracy = 100 * (tm.predict(X_test) == Y_test).mean() test_time = benchmark2.elapsed() - total_time = benchmark_total.elapsed() + total_time = benchmark_total.elapsed() - # Append results for each epoch + # Append results for each epoch results.append({ "Algorithm": "TMClassifier", "Noise_Ratio": args.dataset_noise_ratio,