Commit 32c5e3b

Fix tests and clean up new deduplication algorithm code (#54)

1 parent 3d9b906 commit 32c5e3b

File tree

3 files changed: +201 −191 lines changed
asreviewcontrib/datatools/dedup.py

+126 −110
@@ -1,5 +1,4 @@
 import re
-from argparse import Namespace
 from difflib import SequenceMatcher
 
 import ftfy
@@ -13,199 +12,216 @@
 
 
 def _print_similar_list(
-        similar_list: list[tuple[int, int]],
-        data: pd.Series,
-        pid: str,
-        pids: pd.Series = None
-) -> None:
-
+    similar_list: list[tuple[int, int]],
+    data: pd.Series,
+    pid: str,
+    pids: pd.Series = None,
+) -> None:
     print_seq_matcher = SequenceMatcher()
     console = Console()
 
     if pids is not None:
-        print(f'Found similar titles or same {pid} at lines:')
+        print(f"Found similar titles or same {pid} at lines:")
     else:
-        print('Found similar titles at lines:')
+        print("Found similar titles at lines:")
 
     for i, j in similar_list:
         print_seq_matcher.set_seq1(data.iloc[i])
         print_seq_matcher.set_seq2(data.iloc[j])
         text = Text()
 
         if pids is not None:
-            text.append(f'\nLines {i+1} and {j+1} ', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1} ", style="bold")
             if pids.iloc[i] == pids.iloc[j]:
-                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style='dim')
+                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim")
             else:
-                text.append(f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n',
-                            style='dim')
+                text.append(
+                    f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim"
+                )
 
         else:
-            text.append(f'\nLines {i+1} and {j+1}:\n', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold")
 
         for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():
-            if tag == 'replace':
+            if tag == "replace":
                 # add rich strikethrough
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'delete':
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-            if tag == 'insert':
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'equal':
-                text.append(f'{data.iloc[i][i1:i2]}', style='dim')
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "delete":
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+            if tag == "insert":
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "equal":
+                text.append(f"{data.iloc[i][i1:i2]}", style="dim")
 
         console.print(text)
 
-    print('')
+    print("")
 
 
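A note on the rendering logic above, not part of the commit: `difflib.SequenceMatcher.get_opcodes()` yields `(tag, i1, i2, j1, j2)` tuples describing how to turn the first sequence into the second, and `_print_similar_list` maps each tag to a rich style (red strikethrough for deleted or replaced text, green for insertions, dim for unchanged text). A minimal standalone sketch, with invented example titles:

# Standalone sketch of the opcode-to-style mapping used by _print_similar_list;
# the two titles are invented examples.
from difflib import SequenceMatcher

sm = SequenceMatcher()
sm.set_seq1("effects of exercise on depression")
sm.set_seq2("effect of exercise in depression")

for tag, i1, i2, j1, j2 in sm.get_opcodes():
    # tag is one of "replace", "delete", "insert", "equal";
    # a[i1:i2] is the span in the first title, b[j1:j2] in the second
    print(f"{tag:8} {sm.a[i1:i2]!r} -> {sm.b[j1:j2]!r}")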
 def _drop_duplicates_by_similarity(
-        asdata: ASReviewData,
-        pid: str,
-        similarity: float = 0.98,
-        skip_abstract: bool = False,
-        discard_stopwords: bool = False,
-        stopwords_language: str = 'english',
-        strict_similarity: bool = False,
-        verbose: bool = False,
-) -> None:
-
-    if skip_abstract:
-        data = asdata.df['title']
+    asdata: ASReviewData,
+    pid: str,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    if title_only:
+        data = asdata.df["title"]
     else:
         data = pd.Series(asdata.texts)
 
-    symbols_regex = re.compile(r'[^ \w\d\-_]')
-    spaces_regex = re.compile(r'\s+')
+    symbols_regex = re.compile(r"[^ \w\d\-_]")
+    spaces_regex = re.compile(r"\s+")
 
     # clean the data
     s = (
-        data
-        .apply(ftfy.fix_text)
-        .str.replace(symbols_regex, '', regex=True)
-        .str.replace(spaces_regex, ' ', regex=True)
+        data.apply(ftfy.fix_text)
+        .str.replace(symbols_regex, "", regex=True)
+        .str.replace(spaces_regex, " ", regex=True)
         .str.lower()
         .str.strip()
-        .replace('', None)
+        .replace("", None)
     )
 
-    if discard_stopwords:
+    if stopwords_language:
         try:
             from nltk.corpus import stopwords
+
             stopwords_set = set(stopwords.words(stopwords_language))
         except LookupError:
             import nltk
-            nltk.download('stopwords')
+
+            nltk.download("stopwords")
             stopwords_set = set(stopwords.words(stopwords_language))
 
-        stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
-        s = s.str.replace(stopwords_regex, '', regex=True)
+        stopwords_regex = re.compile(rf"\b{'\\b|\\b'.join(stopwords_set)}\b")
+        s = s.str.replace(stopwords_regex, "", regex=True)
 
     seq_matcher = SequenceMatcher()
     duplicated = [False] * len(s)
 
-    if verbose:
-        similar_list = []
-    else:
-        similar_list = None
-
+    similar_list = []
     if pid in asdata.df.columns:
         if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
             pids = asdata.df[pid].str.strip().replace("", None)
             if pid == "doi":
                 pids = pids.str.lower().str.replace(
                     r"^https?://(www\.)?doi\.org/", "", regex=True
                 )
-
         else:
             pids = asdata.df[pid]
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has the same pid or similar length
-            for j, t in s.iloc[i+1:][(asdata.df[pid] == asdata.df.iloc[i][pid]) |
-                                     (abs(s.str.len() - len(text)) < 5)].items():
+            for j, t in s.iloc[i + 1 :][
+                (asdata.df[pid] == asdata.df.iloc[i][pid])
+                | (abs(s.str.len() - len(text)) < 5)
+            ].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts have the same pid or are similar enough,
                 # mark the second one as duplicate
-                if pids.iloc[i] == pids.iloc[j] or \
-                        (seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity)):
-
-                    if verbose and not duplicated[j]:
+                if pids.iloc[i] == pids.iloc[j] or (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
-
                     duplicated[j] = True
 
-        if verbose:
-            _print_similar_list(similar_list, data, pid, pids)
-
     else:
-        print(f'Not using {pid} for deduplication because there is no such data.')
+        print(f"Not using {pid} for deduplication because there is no such data.")
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has similar length
-            for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items():
+            for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts are similar enough, mark the second one as duplicate
-                if seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity):
-
-                    if verbose and not duplicated[j]:
+                if (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
-
                     duplicated[j] = True
-
-        if verbose:
-            _print_similar_list(similar_list, data, pid)
-
     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
+    if verbose:
+        _print_similar_list(similar_list, data, pid)
+
+
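The duplicate check above leans on difflib's documented cascade of bounds: `real_quick_ratio()` and `quick_ratio()` are cheap upper bounds on `ratio()`, so `and` short-circuiting rejects most candidate pairs before the expensive exact score is computed, and `ratio()` itself is only consulted when `strict` is set. Combined with the pre-filter that only compares records whose cleaned texts differ in length by fewer than 5 characters, this keeps the pairwise loop tractable. A minimal sketch of the cascade in isolation; `probably_duplicates` is an invented name, not part of the commit:

from difflib import SequenceMatcher


def probably_duplicates(a: str, b: str, threshold: float = 0.98, strict: bool = False) -> bool:
    # Same short-circuit order as _drop_duplicates_by_similarity:
    # cheapest upper bound first, exact ratio only when strict is requested.
    sm = SequenceMatcher(a=a, b=b)
    return (
        sm.real_quick_ratio() > threshold
        and sm.quick_ratio() > threshold
        and (not strict or sm.ratio() > threshold)
    )


print(probably_duplicates("a systematic review of x", "a systematic review of x"))  # True
print(probably_duplicates("a systematic review of x", "an entirely different title"))  # False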
+def deduplicate_data(
+    asdata: ASReviewData,
+    output_path: str = None,
+    pid: str = "doi",
+    similar: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Deduplicate an ASReview data object.
+
+    Parameters
+    ----------
+    asdata : ASReviewData
+        The data object.
+    output_path : str, optional
+        If provided, the deduplicated data object is stored at this location.
+        By default None.
+    pid : str, optional
+        Principal identifier to use for deduplication, by default "doi".
+    similar : bool, optional
+        Whether to deduplicate 'similar' records. The similarity of the records is
+        calculated using the `SequenceMatcher` from `difflib`. By default False.
+    threshold : float, optional
+        Threshold score above which two records are considered duplicates.
+        By default 0.98. Only applies if `similar` is set to `True`.
+    title_only : bool, optional
+        Only use the title for deduplication, by default False.
+    stopwords_language : str, optional
+        Remove stopwords from this language before deduplicating, for example
+        'english'. By default None. Only applies if `similar` is set to `True`.
+    strict : bool, optional
+        Use a stricter algorithm to calculate the similarity between records.
+        By default False. Only applies if `similar` is set to `True`.
+    verbose : bool, optional
+        Get verbose output during deduplicating. By default False. Only applies if
+        `similar` is set to `True`.
+    """
     initial_length = len(asdata.df)
 
-    if not args.similar:
-        if args.pid not in asdata.df.columns:
-            print(
-                f'Not using {args.pid} for deduplication '
-                'because there is no such data.'
-            )
+    if not similar:
+        if pid not in asdata.df.columns:
+            print(f"Not using {pid} for deduplication because there is no such data.")
 
         # retrieve deduplicated ASReview data object
-        asdata.drop_duplicates(pid=args.pid, inplace=True)
+        asdata.drop_duplicates(pid=pid, inplace=True)
 
     else:
         _drop_duplicates_by_similarity(
-            asdata,
-            args.pid,
-            args.threshold,
-            args.title_only,
-            args.stopwords,
-            args.stopwords_language,
-            args.strict,
-            args.verbose,
-        )
+            asdata=asdata,
+            pid=pid,
+            threshold=threshold,
+            title_only=title_only,
+            stopwords_language=stopwords_language,
+            strict=strict,
+            verbose=verbose,
+        )
+
+    if output_path:
+        asdata.to_file(output_path)
 
     # count duplicates
     n_dup = initial_length - len(asdata.df)
-
-    if args.output_path:
-        asdata.to_file(args.output_path)
-        print(
-            f'Removed {n_dup} duplicates from dataset with'
-            f' {initial_length} records.'
-        )
-    else:
-        print(
-            f'Found {n_dup} duplicates in dataset with'
-            f' {initial_length} records.'
-        )
+    print(f"Found {n_dup} duplicates in dataset with {initial_length} records.")
