Create datalake-demo Ref Arch (#682)
* Update mysql, dms and s3 for data lake demo
* First version - dms
* Update mysql endpoint
* Create peering between datascience and apps-devstg
* Fix account name in secrets manager layer - apps-devstg account
* Grant rds-data access to DevOps
* Create datalake - demo
* add comment in aurora-postgres-devstg
* Remove comments and run terraform fmt
* Update endpoint name in provider
1 parent 6ed42ee · commit fb67b8a
Showing 16 changed files with 610 additions and 77 deletions.
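Per the commit message, DMS replicates the MySQL and PostgreSQL sources into the raw S3 bucket that the Glue ETL below consumes. As a minimal operational sketch, assuming a replication task already exists (the task identifier here is hypothetical, not part of this commit), one could start it and inspect it with boto3:

import boto3

dms = boto3.client("dms", region_name="us-east-1")

# "datalake-demo-task" is a hypothetical identifier; the real task is
# defined in the DMS layer this commit updates.
tasks = dms.describe_replication_tasks(
    Filters=[{"Name": "replication-task-id", "Values": ["datalake-demo-task"]}]
)
task_arn = tasks["ReplicationTasks"][0]["ReplicationTaskArn"]

# Kick off a full load; DMS writes Parquet files under the target
# endpoint's S3 prefix (the destinationdata/ paths read by the ETL script).
dms.start_replication_task(
    ReplicationTaskArn=task_arn,
    StartReplicationTaskType="start-replication",
)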
New file (38 lines): Glue catalog table and Athena workgroup (Terraform)
module "glue_catalog_products_orders" { | ||
source = "github.com/binbashar/terraform-aws-glue.git//modules/glue-catalog-table?ref=0.4.0" | ||
|
||
catalog_table_name = "products_orders" | ||
catalog_table_description = "Glue Catalog table" | ||
database_name = module.glue_catalog_database.name | ||
|
||
storage_descriptor = { | ||
location = format("s3://%s/product_order_summary/", module.s3_bucket_data_processed.s3_bucket_id) | ||
input_format = local.parquet_input_format | ||
output_format = local.parquet_output_format | ||
ser_de_info = { | ||
serialization_library = local.parquet_serialization_library | ||
} | ||
} | ||
|
||
|
||
} | ||
|
||
resource "aws_athena_workgroup" "datalake-workgroup" { | ||
name = "datalake" | ||
|
||
configuration { | ||
enforce_workgroup_configuration = true | ||
publish_cloudwatch_metrics_enabled = false | ||
|
||
result_configuration { | ||
output_location = "s3://${module.s3_bucket_data_processed.s3_bucket_id}/output/" | ||
|
||
encryption_configuration { | ||
encryption_option = "SSE_KMS" | ||
kms_key_arn = data.terraform_remote_state.keys.outputs.aws_kms_key_arn | ||
} | ||
} | ||
} | ||
|
||
force_destroy = true | ||
} |
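Once this layer is applied and the ETL below has produced product_order_summary/, the table can be queried through the datalake workgroup. A minimal sketch, assuming the Glue database is named "datalake_demo" (hypothetical; the real name comes from module.glue_catalog_database):

import boto3

athena = boto3.client("athena", region_name="us-east-1")

# The workgroup enforces its own result location and KMS encryption,
# so no OutputLocation needs to be passed here.
query = athena.start_query_execution(
    QueryString="SELECT name, total_products_sold FROM products_orders LIMIT 10;",
    QueryExecutionContext={"Database": "datalake_demo"},  # hypothetical name
    WorkGroup="datalake",
)

state = athena.get_query_execution(QueryExecutionId=query["QueryExecutionId"])
print(state["QueryExecution"]["Status"]["State"])  # QUEUED / RUNNING / SUCCEEDED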
data-science/us-east-1/datalake-demo--/config/etl_script.py (43 additions, 0 deletions)
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import functions as F

# Initialize Glue Context
glueContext = GlueContext(SparkContext.getOrCreate())

# Load raw data from S3 (directories of Parquet files written by DMS)
products_df = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://bb-data-science-data-lake-demo-data-raw-bucket/destinationdata/demoapps/sockshop_products/"]},
    format="parquet"
)

orders_df = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://bb-data-science-data-lake-demo-data-raw-bucket/destinationdata/public/sockshop_orders/"]},
    format="parquet"
)

# Convert to Spark DataFrames for transformation
products = products_df.toDF()
orders = orders_df.toDF()

# Join product and order data on product_id
product_orders = products.join(orders, products.product_id == orders.product_id, "inner") \
    .select(products.product_id, products.name, products.price, orders.order_id, orders.quantity)

# Aggregate: total units sold per product
product_summary = product_orders.groupBy("product_id", "name") \
    .agg(F.sum("quantity").alias("total_products_sold"))

# Convert back to a Glue DynamicFrame
transformed = DynamicFrame.fromDF(product_summary, glueContext, "transformed")

# Write transformed data to S3 as Parquet
glueContext.write_dynamic_frame.from_options(
    frame=transformed,
    connection_type="s3",
    connection_options={"path": "s3://bb-data-science-data-lake-demo-data-processed-bucket/product_order_summary/"},
    format="parquet"
)
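To exercise the script end to end, one might register it as a Glue job and trigger a run. A sketch with boto3, assuming a job pointing at this script already exists (the job name below is hypothetical):

import boto3

glue = boto3.client("glue", region_name="us-east-1")

# "datalake-demo-etl" is a hypothetical job name; the actual job is created
# by the Terraform layer that ships this script.
run = glue.start_job_run(JobName="datalake-demo-etl")

status = glue.get_job_run(JobName="datalake-demo-etl", RunId=run["JobRunId"])
print(status["JobRun"]["JobRunState"])  # STARTING / RUNNING / SUCCEEDED / FAILED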
New file (50 lines): seed tables and demo rows for the MySQL and PostgreSQL sources (Terraform, sql provider)
data "sql_query" "mysql_create_table" { | ||
provider = sql.mysql | ||
query = <<EOT | ||
CREATE TABLE IF NOT EXISTS sockshop_products ( | ||
product_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, | ||
name VARCHAR(100), | ||
color VARCHAR(50), | ||
price DECIMAL(10, 2) | ||
); | ||
EOT | ||
} | ||
|
||
data "sql_query" "mysql_insert_data" { | ||
provider = sql.mysql | ||
query = <<EOT | ||
INSERT INTO sockshop_products (product_id, name, color, price) VALUES | ||
(null, 'Red Socks', 'Red', 9.99), | ||
(null, 'Blue Socks', 'Blue', 8.99), | ||
(null, 'Green Socks', 'Green', 7.99); | ||
EOT | ||
|
||
depends_on = [data.sql_query.mysql_create_table] | ||
} | ||
|
||
data "sql_query" "postgres_create_table" { | ||
provider = sql.postgres | ||
query = <<EOT | ||
CREATE TABLE IF NOT EXISTS sockshop_orders ( | ||
order_id SERIAL PRIMARY KEY, | ||
product_id INT, | ||
quantity INT, | ||
order_date DATE, | ||
total DECIMAL(10, 2) | ||
); | ||
EOT | ||
} | ||
|
||
data "sql_query" "postgres_insert_data" { | ||
provider = sql.postgres | ||
query = <<EOT | ||
INSERT INTO sockshop_orders (product_id, quantity, order_date, total) VALUES | ||
(1, 2, '2024-11-01', 19.98), | ||
(2, 1, '2024-11-02', 8.99), | ||
(3, 3, '2024-11-03', 23.97) | ||
ON CONFLICT (order_id) DO NOTHING; | ||
EOT | ||
|
||
depends_on = [data.sql_query.postgres_create_table] | ||
} |
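After an apply, the seeded rows can be spot-checked directly against the PostgreSQL source before DMS replicates them. A minimal verification sketch, assuming network access to the instance and psycopg2 installed; the host, database name, and credentials below are placeholders:

import psycopg2

conn = psycopg2.connect(
    host="aurora-postgres-devstg.example.internal",  # hypothetical endpoint
    dbname="sockshop",                               # hypothetical database name
    user="demo",
    password="change-me",
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT order_id, product_id, quantity, total FROM sockshop_orders ORDER BY order_id;")
    for row in cur.fetchall():
        print(row)  # expect the three seeded orders
conn.close()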