generate_aws_idx.py
"""Simple AWS builder."""
from glob import glob
from os import makedirs
from os.path import basename
import json
import pandas as pd
AWS_JSON_DIR = "cloud-json/aws"
AWS_JSON_GLOB = f"{AWS_JSON_DIR}/*.json"
AWS_IDX_DIR = "public/idx/aws"
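# Layout: one input JSON file per region under AWS_JSON_DIR; indexes are
# written under AWS_IDX_DIR, split by OwnerId and by ImageId.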


def get_regions() -> list:
    """Get the region names from the input data filenames."""
    return [basename(x).split(".")[0] for x in glob(AWS_JSON_GLOB)]
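# e.g. "cloud-json/aws/us-east-1.json" yields "us-east-1"
# (assumes each region's file is named <region>.json with no extra dots).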


def read_data():
    """Generate a dataframe containing all of the region data."""
    print("Reading data from JSON files...")
    data_frames = [read_json(region) for region in get_regions()]
    df = pd.concat(data_frames, ignore_index=True)
    # Sort all images chronologically by creation date.
    df.sort_values(by=["CreationDate"], inplace=True)
    return df


def read_json(region: str) -> pd.DataFrame:
    """Read the JSON data for a region."""
    filename = f"{AWS_JSON_DIR}/{region}.json"
    temp_df = pd.read_json(filename, orient="records")
    # Create a view of the data with only the fields the indexes need.
    schema_fields = [
        "ImageId",
        "OwnerId",
        "Name",
        "Architecture",
        "VirtualizationType",
        "CreationDate",
    ]
    region_view = temp_df[schema_fields].copy()
    # Add the region to the view.
    region_view["Region"] = region
    return region_view
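# The fields above match per-image keys returned by `aws ec2 describe-images`,
# which is presumably how the region JSON files are produced.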


def write_owner_ids(df):
    """Split the data frame by OwnerId and write one index per owner."""
    print("Writing images based on OwnerId")
    for owner_id in df.OwnerId.unique():
        owner_df = df[df.OwnerId == owner_id]
        # Exclude any owners with fewer than 10 images.
        if len(owner_df.index) < 10:
            continue
        output_dir = f"{AWS_IDX_DIR}/ownerid/{owner_id}"
        makedirs(output_dir, exist_ok=True)
        owner_df.to_json(f"{output_dir}/index.json", orient="records")
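# Produces public/idx/aws/ownerid/<OwnerId>/index.json, one JSON array of
# images per owner.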


def write_image_ids(df):
    """Split the data frame by ImageId and write one index per image."""
    print("Writing images based on ImageId")
    counter = 0
    # This seems weird, but serializing the frame once as JSON lines and
    # splitting it is much, much faster than letting pandas write each row.
    for row in df.to_json(orient="records", lines=True).splitlines():
        image_id = json.loads(row)["ImageId"]
        output_dir = f"{AWS_IDX_DIR}/imageid/{image_id}"
        makedirs(output_dir, exist_ok=True)
        with open(f"{output_dir}/index.json", "w") as fileh:
            fileh.write(row)
        counter += 1
        if counter % 250000 == 0:
            print(f"Processed {counter // 1000}K images...")


def main():
    """Read every region's data and write the owner and image indexes."""
    df = read_data()
    df.info()
    print(f"Total rows: {len(df.index)}")
    print(f"Unique ImageId: {len(df.ImageId.unique())}")
    write_owner_ids(df)
    write_image_ids(df)


if __name__ == "__main__":
    main()
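
# Usage (assuming the per-region JSON files already exist under cloud-json/aws):
#   python generate_aws_idx.py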