Create datalake-demo Ref Arch (#682)
* Update mysql, dms and s3 for data lake demo

* First version - dms

* Update mysql endpoint

* Create peering between datascience and apps-devstg

* Fix account name in secrets manager layer - apps-devstg account

* Grant rds-data access to DevOps

* Create datalake-demo

* Add comment in aurora-postgres-devstg

* Remove comments and run terraform fmt

* Update endpoint name in provider
martingaleano authored Dec 27, 2024
1 parent 6ed42ee commit fb67b8a
Showing 16 changed files with 610 additions and 77 deletions.
16 changes: 12 additions & 4 deletions apps-devstg/us-east-1/databases-aurora-pgsql --/cluster_aurora.tf
@@ -19,7 +19,7 @@ module "apps_devstg_aurora_postgresql" {
name = local.name
engine = local.engine
engine_mode = "provisioned"
engine_version = "14.5"
engine_version = "14.8"

# Initial database and credentials
database_name = "demoapps"
@@ -62,8 +62,15 @@ module "apps_devstg_aurora_postgresql" {
# enabled_cloudwatch_logs_exports = ["audit", "error", "general", "slowquery"]

# Database parameters: you can specify your own if you must
# db_parameter_group_name = aws_db_parameter_group.aurora_db_57_parameter_group.id
# db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.aurora_57_cluster_parameter_group.id
#db_parameter_group_name = aws_db_parameter_group.aurora_db_57_parameter_group.id
#db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.aurora_57_cluster_parameter_group.id
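# Logical replication is required so AWS DMS can capture ongoing changes (CDC) from this Aurora PostgreSQL cluster; the parameter takes effect after a reboot.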
create_db_cluster_parameter_group = true
db_cluster_parameter_group_family = "aurora-postgresql14"
db_cluster_parameter_group_parameters = [{
name = "rds.logical_replication"
value = "1"
apply_method = "pending-reboot"
}]

# If true, must add policy to iam auth (user or role)
iam_database_authentication_enabled = false
@@ -72,7 +79,8 @@ module "apps_devstg_aurora_postgresql" {
create_security_group = true
allowed_cidr_blocks = [
data.terraform_remote_state.vpc.outputs.vpc_cidr_block,
data.terraform_remote_state.shared-vpc.outputs.vpc_cidr_block
data.terraform_remote_state.shared-vpc.outputs.vpc_cidr_block,
data.terraform_remote_state.datascience-vpc.outputs.vpc_cidr_block
]

tags = local.tags
14 changes: 13 additions & 1 deletion apps-devstg/us-east-1/databases-aurora-pgsql --/config.tf
@@ -46,7 +46,7 @@ data "terraform_remote_state" "vpc" {
region = var.region
profile = var.profile
bucket = var.bucket
key = "${var.environment}/network/terraform.tfstate"
key = "${var.environment}/k8s-eks/network/terraform.tfstate" # Use k8s-vpc to avoid network overlapping
}
}

@@ -60,3 +60,15 @@ data "terraform_remote_state" "shared-vpc" {
key = "shared/network/terraform.tfstate"
}
}

data "terraform_remote_state" "datascience-vpc" {
backend = "s3"

config = {
region = var.region
profile = "${var.project}-data-science-devops"
bucket = "${var.project}-data-science-terraform-backend"
key = "data-science/network/terraform.tfstate"
}
}

5 changes: 5 additions & 0 deletions apps-devstg/us-east-1/databases-aurora-pgsql --/outputs.tf
@@ -4,6 +4,11 @@ output "cluster_id" {
value = module.apps_devstg_aurora_postgresql.cluster_id
}

output "cluster_arn" {
description = "The ID of the cluster"
value = module.apps_devstg_aurora_postgresql.cluster_arn
}

output "cluster_resource_id" {
description = "The Resource ID of the cluster"
value = module.apps_devstg_aurora_postgresql.cluster_resource_id
8 changes: 7 additions & 1 deletion data-science/us-east-1/base-network/locals.tf
@@ -84,7 +84,13 @@ locals {
profile = "${var.project}-apps-devstg-devops"
bucket = "${var.project}-apps-devstg-terraform-backend"
key = "apps-devstg/network/terraform.tfstate"
}
}
apps-devstg-k8s-eks = {
region = var.region
profile = "${var.project}-apps-devstg-devops"
bucket = "${var.project}-apps-devstg-terraform-backend"
key = "apps-devstg/k8s-eks/network/terraform.tfstate"
}
}

}
@@ -10,7 +10,7 @@ data "aws_secretsmanager_secret_version" "administrator" {
}

module "demoapps" {
source = "github.com/binbashar/terraform-aws-rds-aurora.git?ref=v7.2.2"
source = "github.com/binbashar/terraform-aws-rds-aurora.git?ref=v7.3.0"

# General settings
name = "${var.project}-${var.environment}-binbash-aurora-mysql"
@@ -20,7 +20,7 @@

# Initial database and credentials
database_name = "demoapps"
master_username = "admin"
master_username = jsondecode(data.aws_secretsmanager_secret_version.administrator.secret_string)["username"]
master_password = jsondecode(data.aws_secretsmanager_secret_version.administrator.secret_string)["password"]
create_random_password = false

@@ -62,6 +62,13 @@ module "demoapps" {
# Database parameters: you can specify your own if you must
# db_parameter_group_name = aws_db_parameter_group.aurora_db_57_parameter_group.id
# db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.aurora_57_cluster_parameter_group.id
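# Row-based binary logging is required so AWS DMS can capture ongoing changes (CDC) from this Aurora MySQL cluster; the parameter takes effect after a reboot.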
create_db_cluster_parameter_group = true
db_cluster_parameter_group_family = "aurora-mysql5.7"
db_cluster_parameter_group_parameters = [{
name = "binlog_format"
value = "ROW"
apply_method = "pending-reboot"
}]

# If true, must add policy to iam auth (user or role)
iam_database_authentication_enabled = false
@@ -24,5 +24,12 @@ resource "mysql_grant" "sockshop" {
user = mysql_user.sockshop.user
host = mysql_user.sockshop.host
database = mysql_database.sockshop.name
privileges = ["SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "REFERENCES", "DROP", "REPLICATION SLAVE", "REPLICATION CLIENT" ]
privileges = ["SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "REFERENCES", "DROP" ]
}

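# REPLICATION SLAVE and REPLICATION CLIENT are global privileges in MySQL, so they are granted on *.* in a separate grant rather than on the sockshop database.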
resource "mysql_grant" "sockshop_global" {
user = mysql_user.sockshop.user
host = mysql_user.sockshop.host
privileges = ["REPLICATION SLAVE", "REPLICATION CLIENT"]
database = "*" # Use * for global privileges
}
5 changes: 5 additions & 0 deletions data-science/us-east-1/databases-aurora-mysql--/outputs.tf
@@ -4,6 +4,11 @@ output "cluster_id" {
value = module.demoapps.cluster_id
}

output "cluster_arn" {
description = "The ID of the cluster"
value = module.demoapps.cluster_arn
}

output "cluster_resource_id" {
description = "The Resource ID of the cluster"
value = module.demoapps.cluster_resource_id
28 changes: 28 additions & 0 deletions data-science/us-east-1/datalake-demo--/README.md
@@ -1,5 +1,33 @@
# binbash Leverage™ Data Lake Reference Architecture

## Data Lake Overview

A **data lake** is a centralized storage repository designed to hold vast amounts of raw data in its native format, whether structured, semi-structured, or unstructured. Unlike traditional data warehouses, which require data to be processed and structured before storage, a data lake provides the flexibility to store data in its original form until it's needed for analysis.

## What Makes a Data Lake Unique?

1. **Data Diversity**:
A data lake can store multiple types of data, including:
- Structured data (e.g., databases, tables).
- Semi-structured data (e.g., JSON, XML).
- Unstructured data (e.g., images, videos, logs).

2. **Scalability and Cost-Efficiency**:
Built on cost-effective storage technologies like cloud-based object storage, data lakes can scale to accommodate petabytes or even exabytes of data.

3. **Flexibility**:
By storing raw data, a data lake supports a "store first, analyze later" approach. Users can explore and analyze data for various use cases, including:
- Business Intelligence (BI).
- Advanced Analytics.
- Machine Learning (ML) and Artificial Intelligence (AI).

4. **Centralized Access**:
A data lake serves as a single source of truth, consolidating data from disparate sources into one platform.

5. **Integration Capabilities**:
It can integrate with tools and services for data processing, transformation, analytics, and visualization.


## Overview

This document provides an overview of the Data Lake architecture depicted in the attached diagram and
38 changes: 38 additions & 0 deletions data-science/us-east-1/datalake-demo--/athena.tf
@@ -0,0 +1,38 @@
module "glue_catalog_products_orders" {
source = "github.com/binbashar/terraform-aws-glue.git//modules/glue-catalog-table?ref=0.4.0"

catalog_table_name = "products_orders"
catalog_table_description = "Glue Catalog table"
database_name = module.glue_catalog_database.name

storage_descriptor = {
location = format("s3://%s/product_order_summary/", module.s3_bucket_data_processed.s3_bucket_id)
input_format = local.parquet_input_format
output_format = local.parquet_output_format
ser_de_info = {
serialization_library = local.parquet_serialization_library
}
}


}

resource "aws_athena_workgroup" "datalake-workgroup" {
name = "datalake"

configuration {
enforce_workgroup_configuration = true
publish_cloudwatch_metrics_enabled = false

result_configuration {
output_location = "s3://${module.s3_bucket_data_processed.s3_bucket_id}/output/"

encryption_configuration {
encryption_option = "SSE_KMS"
kms_key_arn = data.terraform_remote_state.keys.outputs.aws_kms_key_arn
}
}
}

force_destroy = true
}
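As a usage sketch (not part of this commit): once the Glue job has written Parquet output to the processed bucket, the products_orders table can be queried from the datalake workgroup. The named query below is illustrative only; the resource name is hypothetical and the column names assume the schema produced by etl_script.py (name, total_products_sold).

resource "aws_athena_named_query" "product_sales_example" {
name      = "product-sales-example" # hypothetical, not defined in this commit
workgroup = aws_athena_workgroup.datalake-workgroup.id
database  = module.glue_catalog_database.name
query     = <<EOT
SELECT name, total_products_sold
FROM products_orders
ORDER BY total_products_sold DESC;
EOT
}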
40 changes: 36 additions & 4 deletions data-science/us-east-1/datalake-demo--/config.tf
@@ -6,9 +6,15 @@ provider "aws" {
profile = var.profile
}

# endpoint = module.demoapps.cluster_endpoint
# username = module.demoapps.cluster_master_username
# password = jsondecode(data.aws_secretsmanager_secret_version.administrator.secret_string)["password"]
provider "sql" {
alias = "mysql"
url = "mysql://${data.terraform_remote_state.aurora_mysql.outputs.cluster_master_username}:${data.terraform_remote_state.aurora_mysql.outputs.cluster_master_password}@tcp(${data.terraform_remote_state.aurora_mysql.outputs.cluster_endpoint}:3306)/${data.terraform_remote_state.aurora_mysql.outputs.cluster_database_name}"
}

provider "sql" {
alias = "postgres"
url = "postgres://${data.terraform_remote_state.aurora_postgres.outputs.cluster_master_username}:${data.terraform_remote_state.aurora_postgres.outputs.cluster_master_password}@${data.terraform_remote_state.aurora_postgres.outputs.cluster_endpoint}:5432/${data.terraform_remote_state.aurora_postgres.outputs.cluster_database_name}?sslmode=disable"
}

#=============================#
# Backend Config (partial) #
@@ -17,7 +23,11 @@ terraform {
required_version = "~> 1.3"

required_providers {
aws = "~> 5.0"
aws = "~> 5.0"
sql = {
source = "paultyng/sql"
version = "0.5.0"
}
}

backend "s3" {
@@ -50,6 +60,17 @@ data "terraform_remote_state" "secrets" {
}
}

data "terraform_remote_state" "secrets_apps_devstg" {
backend = "s3"

config = {
region = var.region
profile = "bb-apps-devstg-devops"
bucket = "bb-apps-devstg-terraform-backend"
key = "apps-devstg/secrets-manager/terraform.tfstate"
}
}

data "terraform_remote_state" "aurora_mysql" {
backend = "s3"

@@ -61,6 +82,17 @@ data "terraform_remote_state" "aurora_mysql" {
}
}

data "terraform_remote_state" "aurora_postgres" {
backend = "s3"

config = {
region = var.region
profile = "bb-apps-devstg-devops"
bucket = "bb-apps-devstg-terraform-backend"
key = "apps-devstg/databases-aurora-pgsql/terraform.tfstate"
}
}

data "terraform_remote_state" "keys" {
backend = "s3"
config = {
43 changes: 43 additions & 0 deletions data-science/us-east-1/datalake-demo--/config/etl_script.py
@@ -0,0 +1,43 @@
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import functions as F

# Initialize Glue Context
glueContext = GlueContext(SparkContext.getOrCreate())

# Load raw data from S3 (multiple Parquet files)
products_df = glueContext.create_dynamic_frame.from_options(
connection_type="s3",
connection_options={"paths": ["s3://bb-data-science-data-lake-demo-data-raw-bucket/destinationdata/demoapps/sockshop_products/"]}, # Directory containing multiple Parquet files
format="parquet"
)

orders_df = glueContext.create_dynamic_frame.from_options(
connection_type="s3",
connection_options={"paths": ["s3://bb-data-science-data-lake-demo-data-raw-bucket/destinationdata/public/sockshop_orders/"]}, # Directory containing multiple Parquet files
format="parquet"
)

# Convert to Spark DataFrames for transformation
products = products_df.toDF()
orders = orders_df.toDF()

# Perform a join to combine product and order data
product_orders = products.join(orders, products.product_id == orders.product_id, "inner") \
.select(products.product_id, products.name, products.price, orders.order_id, orders.quantity)

# Add a total_amount column per product
product_summary = product_orders.groupBy("product_id", "name") \
.agg(F.sum("quantity").alias("total_products_sold"))

# Convert back to Glue DynamicFrame
transformed = DynamicFrame.fromDF(product_summary, glueContext, "transformed")

# Write transformed data to S3
glueContext.write_dynamic_frame.from_options(
frame=transformed,
connection_type="s3",
connection_options={"path": "s3://bb-data-science-data-lake-demo-data-processed-bucket/product_order_summary/"},
format="parquet" # Output format can be parquet or csv
)
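The Glue job that runs this script is not shown in this excerpt. As a rough sketch only, assuming the script is uploaded to S3 and a Glue service role is defined elsewhere in the layer (both names below are hypothetical), the job could be declared as:

resource "aws_glue_job" "product_order_summary" {
name         = "product-order-summary" # hypothetical name
role_arn     = aws_iam_role.glue_job.arn # assumed IAM role, not defined in this excerpt
glue_version = "4.0"

command {
name            = "glueetl"
script_location = "s3://${module.s3_bucket_data_processed.s3_bucket_id}/scripts/etl_script.py" # assumed upload location
python_version  = "3"
}
}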
50 changes: 50 additions & 0 deletions data-science/us-east-1/datalake-demo--/dataset.tf
@@ -0,0 +1,50 @@
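# Seed demo data: create and populate source tables in the Aurora MySQL and Aurora PostgreSQL clusters so AWS DMS has records to replicate into the data lake.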
data "sql_query" "mysql_create_table" {
provider = sql.mysql
query = <<EOT
CREATE TABLE IF NOT EXISTS sockshop_products (
product_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(100),
color VARCHAR(50),
price DECIMAL(10, 2)
);
EOT
}

data "sql_query" "mysql_insert_data" {
provider = sql.mysql
query = <<EOT
INSERT INTO sockshop_products (product_id, name, color, price) VALUES
(null, 'Red Socks', 'Red', 9.99),
(null, 'Blue Socks', 'Blue', 8.99),
(null, 'Green Socks', 'Green', 7.99);
EOT

depends_on = [data.sql_query.mysql_create_table]
}

data "sql_query" "postgres_create_table" {
provider = sql.postgres
query = <<EOT
CREATE TABLE IF NOT EXISTS sockshop_orders (
order_id SERIAL PRIMARY KEY,
product_id INT,
quantity INT,
order_date DATE,
total DECIMAL(10, 2)
);
EOT
}

data "sql_query" "postgres_insert_data" {
provider = sql.postgres
query = <<EOT
INSERT INTO sockshop_orders (product_id, quantity, order_date, total) VALUES
(1, 2, '2024-11-01', 19.98),
(2, 1, '2024-11-02', 8.99),
(3, 3, '2024-11-03', 23.97)
ON CONFLICT (order_id) DO NOTHING;
EOT

depends_on = [data.sql_query.postgres_create_table]
}