-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_dataset.py
182 lines (162 loc) · 7.63 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import re
import pandas as pd
from scipy.stats import fisher_exact
def categorize_opinion(title_text: str) -> str:
    """Categorize an opinion based on its title text.

    The title is searched case-insensitively for a fixed list of keywords in
    priority order, and the first keyword found decides the category. The
    order matters: a title such as "PARTLY DISSENTING OPINION" contains
    several keywords and is classified by the earliest one in the list.

    Args:
        title_text (str): Title text of an opinion.

    Returns:
        str: One of "PARTLY", "DISSENTING", "CONCURRING" or "OPINION";
            "UNKNOWN" if the title contains none of these keywords.
    """
    # Keywords in priority order: the more specific labels are tested first.
    for category in ("PARTLY", "DISSENTING", "CONCURRING", "OPINION"):
        # re.search stops at the first hit, which is all an existence test needs.
        if re.search(category, title_text, re.I):
            return category
    return "UNKNOWN"
def create_dataframe(
    df_webcasts: pd.DataFrame,
    df_announced: pd.DataFrame,
    df_reported: pd.DataFrame,
    df_questions: pd.DataFrame,
    df_opinions: pd.DataFrame,
) -> pd.DataFrame:
    """Create the question-and-opinion dataset from the previously loaded dataframes.

    For every webcast, the participants are the names that appear both in the
    announced list and in the reported list (each stored as a comma-separated
    string in the "listed" column). One output row is produced per
    participant, stating whether that participant has a question and/or an
    opinion for the webcast, together with the corresponding texts.

    Names are compared exactly as they result from splitting on ","; no
    whitespace trimming or case folding is applied.

    Args:
        df_webcasts (pd.DataFrame): Dataframe containing relevant webcasts
            (column "webcast_id").
        df_announced (pd.DataFrame): Dataframe containing all announced judges
            in press releases (columns "webcast_id", "listed").
        df_reported (pd.DataFrame): Dataframe containing all reported judges
            in judgment documents (columns "webcast_id", "listed").
        df_questions (pd.DataFrame): Dataframe containing all questions
            extracted from hearing transcripts (columns "webcast_id", "name",
            "text", "lang").
        df_opinions (pd.DataFrame): Dataframe containing all opinions
            extracted from judgment documents (columns "webcast_id",
            "opinions", "case_id"); "opinions" maps a name to that person's
            opinion entry.

    Returns:
        pd.DataFrame: One row per participant and webcast with the columns
            "webcast_id", "name", "has_question", "has_opinion", "language",
            "question", "case_id", "opinion" and "opinion_type". Participants
            of a webcast are emitted in sorted name order. Without a question,
            "question" is "" and "language" is "en"; without an opinion,
            "opinion" and "opinion_type" are "".

    Raises:
        IndexError: If a webcast has no matching row in df_announced,
            df_reported or df_opinions.
    """
    columns = [
        "webcast_id",
        "name",
        "has_question",
        "has_opinion",
        "language",
        "question",
        "case_id",
        "opinion",
        "opinion_type",
    ]
    values = []

    for w_id in df_webcasts["webcast_id"]:
        # select all relevant rows from the dfs for the current webcast
        announced = df_announced.loc[df_announced["webcast_id"] == w_id, "listed"]
        reported = df_reported.loc[df_reported["webcast_id"] == w_id, "listed"]
        set_announced = set(announced.iloc[0].split(","))
        set_reported = set(reported.iloc[0].split(","))

        # participants are only people present in announcement and judgment!
        participants = set_announced.intersection(set_reported)

        questions = df_questions.loc[
            df_questions["webcast_id"] == w_id, ["name", "text", "lang"]
        ]
        opinions = df_opinions.loc[
            df_opinions["webcast_id"] == w_id, ["opinions", "case_id"]
        ].iloc[0]
        opinions_by_name = opinions["opinions"]
        case_id = opinions["case_id"]

        # Sorted so the row order is reproducible: iterating a set of strings
        # directly depends on per-process hash randomization.
        for p in sorted(participants):
            # Match on the "name" column only. Testing membership against the
            # whole (name, text, lang) block would also match a question text
            # or a language code that happens to equal a participant's name.
            asked = questions["name"] == p
            q = bool(asked.any())
            o = p in opinions_by_name

            if q:
                # only the first question of a participant is kept
                question_text = questions.loc[asked, "text"].iloc[0]
                lang = questions.loc[asked, "lang"].iloc[0]
            else:
                question_text = ""
                lang = "en"

            if o:
                opinion_text = opinions_by_name[p]
                # the first element of the opinion entry is what gets
                # categorized (presumably the opinion's title - TODO confirm)
                opinion_type = categorize_opinion(opinion_text[0])
            else:
                opinion_text = ""
                opinion_type = ""

            values.append(
                [
                    w_id,
                    p,
                    q,
                    o,
                    lang,
                    question_text,
                    case_id,
                    opinion_text,
                    opinion_type,
                ]
            )

    # create df from extracted values
    return pd.DataFrame(data=values, columns=columns)
def fisher_test(df_qando: pd.DataFrame) -> None:
    """Conduct Fisher's exact test on the question and opinion dataset.

    A 2x2 contingency table is built from the boolean fields "has_question"
    (rows) and "has_opinion" (columns), a two-sided Fisher exact test is run
    on it, and the resulting odds ratio and p-value are printed to stdout.

    Args:
        df_qando (pd.DataFrame): Dataframe containing the question and opinion
            dataset; must provide the columns "has_question" and "has_opinion".
    """
    # Cross tabulation between has_question and has_opinion
    crosstab_result = pd.crosstab(
        index=df_qando["has_question"], columns=df_qando["has_opinion"]
    )
    # crosstab only emits the categories that actually occur. If one flag is
    # constant, the table would not be 2x2, so force the full False/True
    # layout and fill the missing cells with zero counts.
    crosstab_result = crosstab_result.reindex(
        index=[False, True], columns=[False, True], fill_value=0
    )

    # performing Fisher's exact test on the data
    odd_ratio, p_value = fisher_exact(crosstab_result, alternative="two-sided")
    print(f"Exact Fisher test odd ratio is : {odd_ratio:10.8f}")
    print(f"Exact Fisher test P-Value is : {p_value:10.8f}")
def load_json_files(
    fwebcasts: str = "selected_webcasts.json",
    fquestions: str = "dataset_judge_questions.json",
    fannounced: str = "judges_from_press.json",
    freported: str = "judges_from_judgments.json",
    fopinions: str = "opinions_from_judgments.json",
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Read the five JSON input files into dataframes and normalize them.

    Every file is read with "webcast_id" forced to string. Afterwards the
    announced-judges frame gains a "listed" column (the values of each
    "judges" mapping joined with ","), the "hearing_date" columns of the
    opinions and reported-judges frames are converted from millisecond
    timestamps to datetimes, and the opinions frame is reduced to the rows
    whose "webcast_id" occurs in the selected webcasts.

    Args:
        fwebcasts (str, optional): Location of the webcast file. Defaults to "selected_webcasts.json".
        fquestions (str, optional): Location of the question file. Defaults to "dataset_judge_questions.json".
        fannounced (str, optional): Location of the announced judges file. Defaults to "judges_from_press.json".
        freported (str, optional): Location of the reported judges file. Defaults to "judges_from_judgments.json".
        fopinions (str, optional): Location of the opinions file. Defaults to "opinions_from_judgments.json".

    Returns:
        A 5-tuple of dataframes, in this order: selected webcasts, announced
        judges, reported judges, questions asked, opinions from judgments.
        Note that this order differs from the parameter order.
    """
    # webcast ids are identifiers, not numbers - keep them as text everywhere
    id_as_text = {"webcast_id": str}
    webcasts, questions, announced, reported, opinions = (
        pd.read_json(path, dtype=id_as_text)
        for path in (fwebcasts, fquestions, fannounced, freported, fopinions)
    )

    # flatten each "judges" mapping into one comma-separated string
    announced["listed"] = [
        ",".join(judges.values()) for judges in announced["judges"]
    ]

    # homogenize dates: both frames store the hearing date in milliseconds
    for frame in (opinions, reported):
        frame["hearing_date"] = pd.to_datetime(frame["hearing_date"], unit="ms")

    # keep only the opinions that belong to one of the selected webcasts
    is_selected = opinions["webcast_id"].isin(webcasts["webcast_id"])
    opinions = opinions[is_selected]

    return webcasts, announced, reported, questions, opinions
if __name__ == "__main__":
    # Script entry point: read the inputs, build the dataset, persist it and
    # report the Fisher test on it.
    webcasts, announced, reported, questions, opinions = load_json_files(
        fwebcasts="selected_webcasts.json",
        fquestions="dataset_judge_questions.json",
        fannounced="judges_from_press.json",
        freported="judges_from_judgments.json",
        fopinions="opinions_from_judgments.json",
    )

    # one row per hearing participant with their question / opinion (if any)
    dataset = create_dataframe(webcasts, announced, reported, questions, opinions)

    # save the questions and opinions dataset to a new json file
    dataset.to_json("dataset_questions_opinions.json")

    # conduct the exact Fisher test on the dataset
    fisher_test(dataset)