-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifyArticle.py
251 lines (193 loc) · 8.96 KB
/
classifyArticle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import os
import openai
import pypdf
import nltk
from nltk.tokenize import sent_tokenize
from dotenv import load_dotenv
from article import Article
from criteria import CriteriaStore
from habanero import Crossref
# Read a local .env file into the process environment, then pick up the
# OpenAI key from there (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Make sure the NLTK "punkt" data needed for sentence tokenization is present.
nltk.download("punkt")
# Research method categories offered to the model as classification choices.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated into a single element.
RESEARCH_METHODS = [
    "Data Science",
    "Engineering Research",
    "Design Science",
    "Experiments",
    "Grounded Theory",
    "Longitudinal",
    "Meta Science",
    "Optimization",
    "Qualitative Survey",
    "Quantitative Survey",
    "Quantitative Simulation",
    "Qualitative Simulation",
    "Questionnaire Survey",
    "Replication",
    "Repository Mining",
    "Systematic Review",
]
def extract_text_from_pdf(pdf_path, max_chars=4000):
    """Return the leading sentences of a PDF's text as one string.

    The text of every page is concatenated (a newline after each page), split
    into sentences with NLTK, and the first ``max_chars // 20`` sentences are
    joined with single spaces. ``max_chars`` is therefore a rough size budget
    translated into a sentence count, not a hard cap on the returned length.
    """
    reader = pypdf.PdfReader(pdf_path)
    full_text = "".join(page.extract_text() + "\n" for page in reader.pages)

    # Budget of roughly 20 characters per sentence.
    sentence_budget = max_chars // 20
    leading_sentences = sent_tokenize(full_text)[:sentence_budget]
    return " ".join(leading_sentences)
def classify_research_method(pdf_path, model="gpt-4-turbo"):
    """Ask an OpenAI chat model which research method a PDF article uses.

    The excerpt produced by ``extract_text_from_pdf`` is embedded in a prompt
    listing ``RESEARCH_METHODS`` and sent as a single user message.

    :param pdf_path: Path of the PDF to classify.
    :param model: Name of the chat-completions model to query.
    :return: The model's reply with surrounding whitespace removed.
    """
    excerpt = extract_text_from_pdf(pdf_path)
    method_names = ", ".join(RESEARCH_METHODS)

    # The prompt lines stay flush left so no indentation ends up in the text.
    user_prompt = f"""
Please classify the following text into one of the following research methods:
{method_names}.
Please provide your result as follows:
Title: The article's title
Resulting Method: The resulting method. If you do not find a result, please return "mixed method".
Do not return anything else.
Article Text:
{excerpt}
"""

    chat_client = openai.OpenAI(api_key=OPENAI_API_KEY)
    conversation = [{"role": "user", "content": user_prompt}]
    completion = chat_client.chat.completions.create(
        model=model,
        messages=conversation,
    )
    answer = completion.choices[0].message.content
    return answer.strip()
def get_doi_by_title(title):
    """
    Searches for a DOI based on an article title using the Crossref API via the habanero package.

    :param title: The title of the article.
    :return: The DOI of the first record Crossref returns, or None when the
        title is blank or the response contains no usable record.
    """
    # A blank query carries no information, so any record returned for it would
    # be unrelated to the article; report "not found" without a network call.
    if not title or not title.strip():
        return None

    cr = Crossref()
    # NOTE(review): habanero documents `limit` as the page-size argument;
    # confirm that `rows=1` is actually honoured by this call.
    results = cr.works(query_bibliographic=title, rows=1)

    # Tolerate a response without 'message'/'items' instead of raising KeyError.
    items = (results.get('message') or {}).get('items') or []
    if not items:
        return None
    return items[0].get('DOI')
# Example usage: classify every PDF in ./articles/ and store the results.
#
# NOTE(review): this section arrived without indentation, so the nesting below
# is reconstructed from the statements themselves. In particular the
# "already in DB" check, the criteria questions and the save are all placed at
# per-article level (an article found in the DB is still saved, with an empty
# research method) -- confirm this matches the intended flow.
# NOTE(review): the "✅" markers replace a mis-encoded glyph that had split
# those string literals across two lines. The remaining "π"/"β" glyphs in the
# messages also look mis-encoded but are kept exactly as found.
directory = "./articles/"
assistant_id = "asst_UqLB2dxyKWFXqrpTHMh05Eai"


def _ask_assistant(thread_id, assistant_id, content, poll_interval=1.0):
    """Post one user message, run the assistant, and fetch its reply.

    :param thread_id: ID of the thread to post to.
    :param assistant_id: ID of the assistant that should answer.
    :param content: Text of the user message.
    :param poll_interval: Seconds to sleep between run status checks.
    :return: ``(run, message)`` where ``run`` is the last retrieved run and
        ``message`` is the assistant message with the greatest ``created_at``
        in the thread, or None when the run did not end as "completed" or the
        thread holds no assistant message.
    """
    import time  # local import: only this polling helper needs it

    openai.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=content,
    )
    run = openai.beta.threads.runs.create(thread_id=thread_id, assistant_id=assistant_id)

    # Sleep between polls so the API is not hit in a tight loop.
    while run.status in ("queued", "in_progress"):
        time.sleep(poll_interval)
        run = openai.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)

    # Without a completed run the newest message in the thread is not a fresh
    # answer to this question, so report "no reply" instead of reading it.
    if run.status != "completed":
        return run, None

    messages = openai.beta.threads.messages.list(thread_id=thread_id)
    replies = [msg for msg in messages.data if msg.role == "assistant"]
    if not replies:
        return run, None
    return run, max(replies, key=lambda msg: msg.created_at)


def _process_pdf(directory, filename, assistant_id):
    """Extract title, DOI, research method and criteria answers for one PDF and save them."""
    filepath = os.path.join(directory, filename)
    print(f"\nπ Processing: {filepath}")

    # Step 1: each article gets its own thread.
    thread_id = openai.beta.threads.create().id
    print(f"✅ New Thread Created: {thread_id}")

    try:
        # Steps 2-3: a short excerpt is enough to ask for the title.
        pdf_excerpt = extract_text_from_pdf(filepath, max_chars=1000)
        run, reply = _ask_assistant(
            thread_id,
            assistant_id,
            f"Extract the title from this research paper:\n\n{pdf_excerpt}\n\nReturn only: Title: <title>",
        )
        if reply is None:
            print(f"Title extraction ended with run status {run.status!r}; skipping {filepath}")
            return
        title = reply.content[0].text.value.replace("Title: ", "").strip()
        print(f"πΉ Extracted Title: {title}")

        # Step 4: look the DOI up by title, retrying once on an error.
        # A second failure propagates to the caller.
        try:
            doi = get_doi_by_title(title)
        except Exception:
            print("DOI retrieval failed, let's try once more.")
            doi = get_doi_by_title(title)

        if not doi:
            print("β Sorry, article's DOI not found by title...")
        else:
            print(f"✅ DOI found: {doi}")

        if Article.does_doi_exist(doi):  # in DB
            print("Article already in DB, no need to extract RM")
            resulting_method = ""
        else:
            # Step 5: a longer excerpt for the research method classification.
            pdf_excerpt_longer = extract_text_from_pdf(filepath, max_chars=5000)
            # Joined outside the f-string: reusing the enclosing quote character
            # inside a replacement field is a syntax error before Python 3.12.
            method_names = ", ".join(RESEARCH_METHODS)
            run, reply = _ask_assistant(
                thread_id,
                assistant_id,
                f"Classify the research method of this research paper, based on exactly these research methods {method_names}. :\n\n{pdf_excerpt_longer}\n\nReturn only: Resulting Method: <method>",
            )
            if reply is None:
                print(f"Method classification ended with run status {run.status!r}; skipping {filepath}")
                return
            resulting_method = reply.content[0].text.value.replace("Resulting Method: ", "").strip()
            print(f"πΉ Extracted Research Method: {resulting_method}")

        # Ask one question per criterion defined for the resulting method.
        criteria_store = CriteriaStore()
        criteria = criteria_store.is_criteria_available(resulting_method)
        quality_lines = []
        if criteria:
            print(f"\n✅ Criteria for {resulting_method}:")
            for c in criteria:
                prompt = criteria_store.generate_prompt_for_criterion(resulting_method, c['description'])
                print(f"\nπ Sending to OpenAI: {prompt}")
                run, reply = _ask_assistant(thread_id, assistant_id, prompt)

                yes_no_answer = "Unknown"
                if reply is not None:
                    print(reply)
                    # The verdict is expected on the last line of the reply.
                    last_line = reply.content[0].text.value.strip().split("\n")[-1].strip()
                    if last_line in ("Yes", "No"):
                        yes_no_answer = last_line

                quality_lines.append(f"{c['description']} {yes_no_answer}\n")
                print(f"✅ Extracted: ({c['description']}, {yes_no_answer})")
        else:
            print(f"\nβ No criteria found for {resulting_method}.")

        # Step 6: store the article.
        article = Article(
            name=title,
            research_method=resulting_method,
            doi=doi if doi else "no doi",
            articleQuality={},
            rmQuality="".join(quality_lines),
            filenames={filename},
        )
        article.save_to_firestore()

        # Report the token usage of the last run on this thread.
        final_run = openai.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
        print(final_run.usage)
    finally:
        # Step 7: delete the thread even when a step above failed or returned early.
        openai.beta.threads.delete(thread_id)
        print(f"✅ Thread {thread_id} closed.")


for filename in os.listdir(directory):
    if not filename.lower().endswith(".pdf"):
        continue  # Skip non-PDF files
    _process_pdf(directory, filename, assistant_id)