-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
cleaned_movies_generator.py
43 lines (38 loc) · 1.62 KB
/
cleaned_movies_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re
import pandas as pd
from pathlib import Path
def get_year(title):
    """Return the release year that closes a movie title, or 0 if absent.

    The year must be a four-digit number in parentheses (first digit 0-2)
    sitting at the very end of the title, e.g. ``"Toy Story (1995)"``.
    Trailing whitespace after the closing parenthesis is tolerated. A
    parenthesised year anywhere else in the title is ignored.

    Args:
        title: The movie title string to inspect.

    Returns:
        The year as an int, or 0 when the title does not end with "(YYYY)".
    """
    # The pattern is anchored to the end of the string, so at most one
    # candidate can match; the capture group holds only ASCII digits, which
    # means int() cannot fail and no exception handling is required.
    match = re.search(r'\(([0-2][0-9]{3})\)\s*$', title)
    if match is None:
        return 0
    return int(match.group(1))
def fix_the(title):
    """Move a trailing ", The" to the front of a title.

    Titles stored as "<Name>, The" come back as "The <Name>"; any other
    title is returned unchanged.

    Args:
        title: The movie title string to normalise.

    Returns:
        The title with the article moved to the front, or the original
        title when it does not end with ", The".
    """
    # subn reports how many substitutions happened, so a single regex pass
    # both detects and removes the suffix.
    remainder, hits = re.subn(r', The$', '', title)
    if not hits:
        return title
    return 'The ' + remainder.strip()
# Clean ml-25m/movies.csv, join it with processed_data/links.csv and write the
# result to ml-25m/clean_movies.csv. All paths are resolved relative to the
# folder that contains this script.
cur_dir = Path(__file__)
raw_mv = pd.read_csv(cur_dir.parent / './ml-25m/movies.csv')

# Turn non-breaking spaces into ordinary spaces, then strip whitespace from
# both ends. regex=False makes the literal replacement explicit.
raw_mv.title = raw_mv.title.str.replace('\xa0', ' ', regex=False).str.strip()

# Extract the year first, while the "(YYYY)" suffix is still in the title ...
raw_mv = raw_mv.assign(year=[get_year(t) for t in raw_mv.title])
# ... then remove every parenthesised section from the title.
raw_mv = raw_mv.assign(
    title=[re.sub(r'\(.*?\)', '', t).strip() for t in raw_mv.title]
)

# Special case: this movie has parentheses in its name, so the step above
# removed the name itself; restore it by hand.
# NOTE(review): the row label 15292 is hard-coded -- confirm it still points
# at this movie whenever movies.csv is regenerated.
raw_mv.at[15292, 'title'] = '(Untitled)'

# Movies beginning with "The" are stored as "<Name>, The".
# This should be fixed, since it is not how people specify movies.
raw_mv = raw_mv.assign(title=[fix_the(t) for t in raw_mv.title])

# It makes more sense for missing genres to be represented as 'NA'.
# regex=False is required: on pandas versions where str.replace defaulted to
# regex=True, the parentheses act as a capture group and the result would be
# '(NA)' instead of 'NA'.
raw_mv.genres = raw_mv.genres.astype('string').str.replace(
    '(no genres listed)', 'NA', regex=False
)

links = pd.read_csv(cur_dir.parent / './processed_data/links.csv')
# The tmdbId column is not needed in the output.
links = links.drop('tmdbId', axis=1)

# Final merged dataset: only movieIds present in both tables are kept
# (merge defaults to an inner join).
cleaned_mv = links.merge(raw_mv, left_on='movieId', right_on='movieId')

# Save to file without the DataFrame index.
cleaned_mv.to_csv(cur_dir.parent / './ml-25m/clean_movies.csv', index=False)