-
Notifications
You must be signed in to change notification settings - Fork 1
/
3_formatCSVToAardvark.py
200 lines (187 loc) · 7.71 KB
/
3_formatCSVToAardvark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# Reads a CSV containing basic geospatial metadata and reformats it according
# to the OGM Aardvark metadata schema. Designed to format the output from 2_mergeCSVs.py.
# Import the required libraries
import json
from datetime import datetime, timezone

import pandas as pd
# The prefix shared by the input file name and the output file name
outputName = 'testdata'
# Define the CSV to be formatted (the merged CSV written by 2_mergeCSVs.py)
file = outputName + '_2_merged.csv'
# Retrieve the current date and time in UTC. The format string below ends in a
# literal "Z", which designates UTC, so a naive local-time datetime.now() would
# record the wrong instant on any machine whose clock is not set to UTC.
now = datetime.now(timezone.utc)
# Format the date and time as: YYYY-MM-DDTHH:MM:SSZ
dt_string = now.strftime('%Y-%m-%dT%H:%M:%SZ')
# Accumulator for the output records: one dict per input row is appended to it
rows = []

# Output columns, in the order they are written to the CSV.
# Every Aardvark field is listed; the ones UMass does not use stay commented
# out so the full schema remains visible in one place.
cols = [
    'dct_title_s',
    # 'dct_alternative_sm',
    'dct_description_sm',
    'dct_language_sm',
    'dct_creator_sm',
    'dct_publisher_sm',
    'schema_provider_s',
    'gbl_resourceClass_sm',
    'gbl_resourceType_sm',
    'dct_subject_sm',
    'dcat_theme_sm',
    'dcat_keyword_sm',
    'dct_temporal_sm',
    'dct_issued_s',
    'gbl_indexYear_im',
    # 'gbl_dateRange_drsim',
    'dct_spatial_sm',
    'locn_geometry',
    'dcat_bbox',
    # 'dcat_centroid',
    # 'dct_relation_sm',
    'pcdm_memberOf_sm',
    # 'dct_isPartOf_sm',
    # 'dct_source_sm',
    # 'dct_isVersionOf_sm',
    # 'dct_replaces_sm',
    # 'dct_isReplacedBy_sm',
    'dct_rights_sm',
    'dct_rightsHolder_sm',
    # 'dct_license_sm',
    'dct_accessRights_s',
    'dct_format_s',
    'gbl_fileSize_s',
    # 'gbl_wxsIdentifier_s',
    'dct_references_s',
    'id',
    'dct_identifier_sm',
    'gbl_mdModified_dt',
    'gbl_mdVersion_s',
    'gbl_suppressed_b',
    'gbl_georeferenced_b',
    # Local (non-Aardvark) fields
    'umass_annotated_s',
    'umass_geonameID_s',
]
# Read the CSV
csv_input = pd.read_csv(file)

# Columns whose values are interpolated into generated text or parsed below.
# pandas reads a blank cell as NaN; without an explicit check that would either
# surface as an opaque TypeError or leak the text "nan" into the record.
_REQUIRED_FIELDS = ('mods_ID', 'place', 'town_short', 'county', 'state',
                    'titleInfo_partNumber', 'dateCreated', 'year')


def _spatial_names(place, town, county, state):
    """Return the dct_spatial_sm place names, most specific first.

    The place-level entry is only included when `place` differs from `town`,
    i.e. when the place name adds information beyond the town itself.
    """
    names = [f'{town}, {state}', f'{county}, {state}']
    if place != town:
        names.insert(0, f'{place}, {town}, {state}')
    return names


def _references(mods_id):
    """Return the dct_references_s value: a compact JSON object as a string.

    It holds the item page URL, two download links (GeoTIFF and JPG) and the
    IIIF image endpoint on credo.library.umass.edu. Serialising with
    json.dumps keeps the output valid JSON even if the identifier contains a
    quote or backslash; the compact separators reproduce the layout of the
    previously hand-built string.
    """
    base = 'https://credo.library.umass.edu'
    refs = {
        'http://schema.org/url': f'{base}/view/full/{mods_id}',
        # Multiple download links (GeoTIFF and JPG)
        'http://schema.org/downloadUrl': [
            {'url': f'{base}/media/{mods_id}.reference.tif',
             'label': 'GeoTIFF'},
            {'url': f'{base}/images/resize/full/{mods_id}-001.jpg',
             'label': 'High-Res JPG'},
        ],
        'http://iiif.io/api/image': f'{base}/iiif/{mods_id}-001.tif/info.json',
    }
    return json.dumps(refs, separators=(',', ':'), ensure_ascii=False)


def _build_record(row, modified):
    """Return the Aardvark record (a dict keyed by output column) for one row.

    row      -- one row of the merged CSV, indexable by column name
    modified -- string stored in gbl_mdModified_dt

    Raises ValueError when a field listed in _REQUIRED_FIELDS is blank, when
    `year` is not convertible to an integer, or when `dateCreated` is not in
    YYYY-MM-DD form.
    """
    missing = [name for name in _REQUIRED_FIELDS if pd.isna(row[name])]
    if missing:
        raise ValueError(
            f"Row with mods_ID {row['mods_ID']!r} has no value for: "
            + ', '.join(missing))

    # Retrieve contents of specific fields
    mods_id = row['mods_ID']
    town = row['town_short']
    county = row['county']
    state = row['state']
    part_number = row['titleInfo_partNumber']
    date_created = row['dateCreated']
    # Normalise to a plain int so the title, dct_temporal_sm and
    # gbl_indexYear_im show e.g. 1951 even if the column was read as float
    # (1951.0) or the value arrives as a NumPy scalar.
    year = int(row['year'])

    # Read the month and day from dateCreated
    taken = datetime.strptime(date_created, '%Y-%m-%d')
    month = taken.strftime('%B')
    day = taken.day

    description = (
        f'Georeferenced black-and-white aerial photograph taken on {month} {day}, '
        f'{year}, of {town} ({county}, {state}'
        '). The photograph was scanned and manually georeferenced in ArcGIS software against 2019 '
        'color orthoimagery from the U.S. Geological Survey. Its coordinate system is NAD 1983 State '
        'Plane Massachusetts Mainland (EPSG 26986).'
    )

    # Fields of the Aardvark schema that UMass does not populate are kept as
    # comments so the record mirrors the full column list.
    return {
        'dct_title_s': f'Aerial Photo of {town}, {state} ({part_number}), {year}',
        # 'dct_alternative_sm',
        'dct_description_sm': [description],
        'dct_language_sm': ['eng'],
        'dct_creator_sm': ['MacConnell, William Preston, 1918-'],
        'dct_publisher_sm': [
            'University of Massachusetts at Amherst. Department of Forestry and Wildlife Management'
        ],
        'schema_provider_s': 'UMass',
        'gbl_resourceClass_sm': ['Imagery'],
        'gbl_resourceType_sm': ['Aerial photographs'],
        'dct_subject_sm': [
            'Historical aerial imagery',
            'Black-and-white photos',
            'Landscape',
        ],
        'dcat_theme_sm': ['Imagery', 'Land Cover'],
        'dcat_keyword_sm': [
            'maconnell',
            'maconell',
            'macconnel',
            'macconel',
        ],
        'dct_temporal_sm': [str(year)],
        'dct_issued_s': date_created,
        'gbl_indexYear_im': [year],
        # 'gbl_dateRange_drsim': '',
        'dct_spatial_sm': _spatial_names(row['place'], town, county, state),
        'locn_geometry': row['geometry'],
        'dcat_bbox': row['bbox'],
        # 'dcat_centroid_ss': '',
        # 'dct_relation_sm': '',
        'pcdm_memberOf_sm': ['umass-macconnell-1951'],
        # 'dct_isPartOf_sm': '',
        # 'dct_source_sm': '',
        # 'dct_isVersionOf_sm': '',
        # 'dct_replaces_sm': '',
        # 'dct_isReplacedBy_sm': '',
        'dct_rights_sm': [
            'Requests to publish, redistribute, or replicate this material should be addressed to Special '
            'Collections and University Archives, University of Massachusetts Amherst Libraries. For more '
            'information, see http://scua.library.umass.edu/services-at-scua/permissions/. '
        ],
        'dct_rightsHolder_sm': [
            'Special Collections and University Archives, University of Massachusetts Amherst Libraries'
        ],
        # 'dct_license_sm': '',
        'dct_accessRights_s': 'Public',
        'dct_format_s': 'GeoTIFF',
        'gbl_fileSize_s': 'about 90 MB',
        # 'gbl_wxsIdentifier_s': '',
        'dct_references_s': _references(mods_id),
        'id': f'umass-{mods_id}',
        'dct_identifier_sm': [mods_id],
        'gbl_mdModified_dt': modified,
        'gbl_mdVersion_s': 'Aardvark',
        'gbl_suppressed_b': False,
        'gbl_georeferenced_b': True,
        'umass_annotated_s': row['annotation'],
        'umass_geonameID_s': row['geoname_ID'],
    }


# Process one row at a time, collecting one output record per input row
for _, row in csv_input.iterrows():
    rows.append(_build_record(row, dt_string))
# Assemble the collected records into a table whose columns follow `cols`,
# then save it as <outputName>_3_aardvark.csv without the index column
output_path = f'{outputName}_3_aardvark.csv'
df = pd.DataFrame(data=rows, columns=cols)
df.to_csv(output_path, index=False)