-
Notifications
You must be signed in to change notification settings - Fork 3
/
deal_with_client_expert_same_line.py
98 lines (82 loc) · 3.72 KB
/
deal_with_client_expert_same_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
python deal_with_client_expert_same_line.py
separates out client and expert utterances
allows a switch to determine if one goes first or second
optionally strips utterances matching a particular string, for instance any utterances which are "WELCOME"
"""
# ******************************************************************************************************************120
# standard imports
# 3rd party imports
import click
import pandas
@click.command()
@click.option('-f', '--filename', type=str, required=True, help='CSV Filename to operate on')
@click.option('-c', '--convo_id_col', type=str, required=True, help='Conversation ID column')
@click.option('-t', '--timestamp_col', type=str, required=True, help='Name column has the timestamp in')
@click.option('-i', '--client_col', type=str, required=True, help='Name of column with the client utterance in')
@click.option('-e', '--expert_col', type=str, required=True, help='Name of column with the expert utterance in')
@click.option('-d', '--drop_this', type=str, required=False, default='', help='Drop any utterances matching this.')
@click.option('-b', '--begin_client', is_flag=True, required=False, default=False,
              help='Whether client or expert is expected to speak first')
def main(filename: str,
         convo_id_col: str,
         timestamp_col: str,
         client_col: str, expert_col: str,
         drop_this: str,
         begin_client: bool
         ) -> None:
    """Split same-line client and expert utterances into one row per utterance.

    Reads the xlsx or utf8 csv file, emits every input row twice (once with the
    client utterance and once with the expert utterance as "text", tagged by
    "input_role"), shifts the client timestamp by one millisecond so that each
    pair sorts in the requested order, and writes the result alongside the
    input as <name>_output.csv.
    """
    # NOTE: click displays the docstring above as the command's --help text,
    # so it is kept as plain prose paragraphs.

    # read df and deal with excel; every column is loaded as str
    if filename.endswith(".xlsx"):
        file_type = ".xlsx"
        df = pandas.read_excel(filename, dtype=str)  # remember - loading excel is slow.
    elif filename.endswith(".csv"):
        file_type = ".csv"
        df = pandas.read_csv(filename, dtype=str, encoding="utf8")
    else:
        raise RuntimeError("Unsupported file format, only works with xlsx or utf8 csv")

    # check cols with an explicit test: an assert would be stripped under
    # `python -O`, and a missing column must end the run with a failure status
    cols = df.columns.to_list()
    for col in [convo_id_col, timestamp_col, client_col, expert_col]:
        if col not in cols:
            print(f'Couldn\'t find col {col}')
            print(cols)
            raise SystemExit(1)
    print('All columns found')
    print(df)

    # baseline expert timestamp
    df["expert_timestamp"] = pandas.to_datetime(df[timestamp_col])

    # add or take away 1 ms based on expert or client first:
    # the client row sorts just before the expert row when begin_client is set,
    # just after it otherwise
    delta = -1 if begin_client else 1
    df["client_timestamp"] = df["expert_timestamp"] + pandas.to_timedelta(delta, "ms")  # ms = milliseconds

    # separate out client: keep the shifted timestamp under the original
    # timestamp column name and expose the client utterance as "text"
    df_client = df.drop(columns=["expert_timestamp", timestamp_col, expert_col])
    df_client = df_client.rename(columns={"client_timestamp": timestamp_col, client_col: "text"})
    df_client["input_role"] = "client"
    print(df_client)

    # separate out expert: same shape as the client frame so the two stack
    df_expert = df.drop(columns=["client_timestamp", timestamp_col, client_col])
    df_expert = df_expert.rename(columns={"expert_timestamp": timestamp_col, expert_col: "text"})
    df_expert["input_role"] = "expert"
    print(df_expert)

    # concatenate overwriting df, then order by conversation and time
    df = pandas.concat([df_expert, df_client], axis=0)
    df.sort_values([convo_id_col, timestamp_col], inplace=True)

    # delete any strip utterances
    if drop_this != '':
        df = df[df["text"] != drop_this]
    print(df)

    # write to output: swap only the trailing extension, so an extension-like
    # substring elsewhere in the path (e.g. "data.csv.d/in.csv") is untouched
    output_filename = filename[:-len(file_type)] + '_output.csv'
    df.to_csv(output_filename, index=False, header=True)
    print(f'Wrote to: {output_filename}')
# Script entry point: only invoke the command when this file is run directly.
if __name__ == "__main__":
    main()  # pylint: disable=no-value-for-parameter