-
Notifications
You must be signed in to change notification settings - Fork 0
/
cbdb_llm_eval.py
145 lines (117 loc) · 4.24 KB
/
cbdb_llm_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import pandas as pd
import sqlite3
import random
# Open the CBDB SQLite database; `c` is the cursor used for every query below.
conn = sqlite3.connect('cbdb.db')
c = conn.cursor()
# Accumulator for the evaluation set: one row per question/answer pair.
df = pd.DataFrame(columns=['question', 'answer'])
# Cursor.execute() returns the cursor itself, so the fetch can be chained.
total = c.execute('SELECT count(*) FROM BIOG_MAIN').fetchone()[0]
print('Total number of records:', total)
# Entry method
# Juren: people whose only distinct entry code in ENTRY_DATA is 39.
# DISTINCT is required because the join yields one row per ENTRY_DATA record:
# if a person has several records with that same code, they would otherwise
# appear several times and could be sampled (and asked about) more than once.
c.execute('''
SELECT DISTINCT ed.c_personid, bm.c_name_chn
FROM ENTRY_DATA ed
JOIN BIOG_MAIN bm ON ed.c_personid = bm.c_personid
WHERE ed.c_personid IN (
SELECT c_personid
FROM ENTRY_DATA
GROUP BY c_personid
HAVING COUNT(DISTINCT c_entry_code) = 1 AND MAX(c_entry_code) = 39
)
''')
rows = c.fetchall()
# Draw at most 10 people, without replacement.
row_sample = random.sample(rows, min(len(rows), 10))
# This is the first sampling section, so it starts the list of sampled ids.
person_ids = [row[0] for row in row_sample]
# Each sampled row is (c_personid, c_name_chn).
new_rows = [
    {'question': f"{row[1]}(c_personid={row[0]})的入仕方式是?", 'answer': '舉人'}
    for row in row_sample
]
# Append all questions in one concat; skip it entirely when nothing was sampled.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
# Jinshi: people whose only distinct entry code in ENTRY_DATA is 36.
# DISTINCT is required because the join yields one row per ENTRY_DATA record:
# if a person has several records with that same code, they would otherwise
# appear several times and could be sampled (and asked about) more than once.
c.execute('''
SELECT DISTINCT ed.c_personid, bm.c_name_chn
FROM ENTRY_DATA ed
JOIN BIOG_MAIN bm ON ed.c_personid = bm.c_personid
WHERE ed.c_personid IN (
SELECT c_personid
FROM ENTRY_DATA
GROUP BY c_personid
HAVING COUNT(DISTINCT c_entry_code) = 1 AND MAX(c_entry_code) = 36
)
''')
rows = c.fetchall()
# Draw at most 10 people, without replacement.
row_sample = random.sample(rows, min(len(rows), 10))
# Record the sampled ids alongside those already collected.
person_ids += [row[0] for row in row_sample]
# Each sampled row is (c_personid, c_name_chn).
new_rows = [
    {'question': f"{row[1]}(c_personid={row[0]})的入仕方式是?", 'answer': '進士'}
    for row in row_sample
]
# Append all questions in one concat; skip it entirely when nothing was sampled.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
# Yin: people whose only distinct entry code in ENTRY_DATA is 118.
# DISTINCT is required because the join yields one row per ENTRY_DATA record:
# if a person has several records with that same code, they would otherwise
# appear several times and could be sampled (and asked about) more than once.
c.execute('''
SELECT DISTINCT ed.c_personid, bm.c_name_chn
FROM ENTRY_DATA ed
JOIN BIOG_MAIN bm ON ed.c_personid = bm.c_personid
WHERE ed.c_personid IN (
SELECT c_personid
FROM ENTRY_DATA
GROUP BY c_personid
HAVING COUNT(DISTINCT c_entry_code) = 1 AND MAX(c_entry_code) = 118
)
''')
rows = c.fetchall()
# Draw at most 10 people, without replacement.
row_sample = random.sample(rows, min(len(rows), 10))
# Record the sampled ids alongside those already collected.
person_ids += [row[0] for row in row_sample]
# Each sampled row is (c_personid, c_name_chn).
new_rows = [
    {'question': f"{row[1]}(c_personid={row[0]})的入仕方式是?", 'answer': '恩蔭'}
    for row in row_sample
]
# Append all questions in one concat; skip it entirely when nothing was sampled.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
# Biographical address
# Simple native-place question: the answer is the name of the ADDR_CODES row
# that the person's c_index_addr_id points at. The filter drops address names
# ending in 旗 and keeps only names that are 2 or 3 characters long.
c.execute('''
SELECT bm.c_personid, bm.c_name_chn, ac.c_name_chn
FROM BIOG_MAIN bm
JOIN ADDR_CODES ac ON bm.c_index_addr_id = ac.c_addr_id
WHERE ac.c_name_chn NOT LIKE '%旗'
AND LENGTH(ac.c_name_chn) BETWEEN 2 AND 3;
''')
candidates = c.fetchall()
# Sample up to 10 rows, or all of them when fewer are available.
sample_size = min(len(candidates), 10)
picked = random.sample(candidates, sample_size)
person_ids.extend(pid for pid, _name, _addr in picked)
# Each picked row is (c_personid, person name, address name).
qa_pairs = [
    {'question': f"{name}(c_personid={pid})的籍貫是?", 'answer': addr}
    for pid, name, addr in picked
]
if qa_pairs:
    df = pd.concat([df, pd.DataFrame(qa_pairs)], ignore_index=True)
# Biographical address belongs question
# For each person in BIOG_MAIN, follow c_index_addr_id through
# ADDR_BELONGS_DATA (c_addr_id -> c_belongs_to), then look up in ADDR_CODES
# both the name of the unit it belongs to and the index address's own name.
# The filter keeps parent units whose name ends in 府 (and not in 旗) and
# index addresses whose name is longer than two characters.
c.execute('''
SELECT
bm.c_personid,
bm.c_name_chn AS biog_name,
ac1.c_name_chn AS belongs_to_name,
ac2.c_name_chn AS index_addr_name
FROM BIOG_MAIN bm
JOIN ADDR_BELONGS_DATA abd ON bm.c_index_addr_id = abd.c_addr_id
JOIN ADDR_CODES ac1 ON abd.c_belongs_to = ac1.c_addr_id
JOIN ADDR_CODES ac2 ON bm.c_index_addr_id = ac2.c_addr_id
WHERE ac1.c_name_chn NOT LIKE '%旗'
AND ac1.c_name_chn LIKE '%府'
AND LENGTH(ac2.c_name_chn) > 2;
''')
rows = c.fetchall()
# Draw at most 10 rows, without replacement.
row_sample = random.sample(rows, min(len(rows), 10))
person_ids += [row[0] for row in row_sample]
# Collapse ids that were collected more than once.
person_ids = list(set(person_ids))
new_rows = []
for row in row_sample:
    # row = (c_personid, biog_name, belongs_to_name, index_addr_name).
    # The "(c_personid=...)" parenthesis is closed here, matching the question
    # format of the other question types in this script.
    subject = f"{row[1]}(c_personid={row[0]})"
    new_rows += [
        # Positive case: the unit the index address belongs to.
        {'question': f"{subject}的籍貫是否為{row[2]}?", 'answer': "是"},
        # Negative case: the index address name with its first character
        # dropped (the SQL guarantees at least two characters remain).
        # NOTE(review): nothing checks that this truncated name differs from
        # the real belongs-to name -- confirm it is always a wrong answer.
        {'question': f"{subject}的籍貫是否為{row[3][1:]}?", 'answer': "否"},
    ]
# Append all questions in one concat; skip it entirely when nothing was sampled.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
# Shuffle the rows (sample the full frame) and renumber the index from 0.
shuffled = df.sample(frac=1)
df = shuffled.reset_index(drop=True)
print(df.head(10))
# Export the evaluation set as both CSV and Excel, without the index column.
# utf-8-sig prefixes the CSV with a BOM -- presumably so spreadsheet tools
# detect the encoding of the Chinese text; confirm with the consumers.
df.to_csv('cbdb_llm_eval.csv', index=False, encoding='utf-8-sig')
df.to_excel('cbdb_llm_eval.xlsx', index=False)
# Dump the sampled person ids, one per line.
with open('person_ids.txt', 'w') as out:
    out.writelines(f"{person_id}\n" for person_id in person_ids)
conn.close()
print('Done')