@@ -1,5 +1,4 @@
 import re
-from argparse import Namespace
 from difflib import SequenceMatcher
 
 import ftfy
@@ -13,199 +12,216 @@
 
 
 def _print_similar_list(
-        similar_list: list[tuple[int, int]],
-        data: pd.Series,
-        pid: str,
-        pids: pd.Series = None
-        ) -> None:
-
+    similar_list: list[tuple[int, int]],
+    data: pd.Series,
+    pid: str,
+    pids: pd.Series = None,
+) -> None:
     print_seq_matcher = SequenceMatcher()
     console = Console()
 
     if pids is not None:
-        print(f'Found similar titles or same {pid} at lines:')
+        print(f"Found similar titles or same {pid} at lines:")
     else:
-        print('Found similar titles at lines:')
+        print("Found similar titles at lines:")
 
     for i, j in similar_list:
         print_seq_matcher.set_seq1(data.iloc[i])
         print_seq_matcher.set_seq2(data.iloc[j])
         text = Text()
 
         if pids is not None:
-            text.append(f'\nLines {i+1} and {j+1} ', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1} ", style="bold")
             if pids.iloc[i] == pids.iloc[j]:
-                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style='dim')
+                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim")
             else:
-                text.append(f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n',
-                            style='dim')
+                text.append(
+                    f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim"
+                )
 
         else:
-            text.append(f'\nLines {i+1} and {j+1}:\n', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold")
 
         for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():
-            if tag == 'replace':
+            if tag == "replace":
                 # add rich strikethrough
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'delete':
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-            if tag == 'insert':
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'equal':
-                text.append(f'{data.iloc[i][i1:i2]}', style='dim')
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "delete":
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+            if tag == "insert":
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "equal":
+                text.append(f"{data.iloc[i][i1:i2]}", style="dim")
 
         console.print(text)
 
-    print('')
+    print("")
 
 
 def _drop_duplicates_by_similarity(
-        asdata: ASReviewData,
-        pid: str,
-        similarity: float = 0.98,
-        skip_abstract: bool = False,
-        discard_stopwords: bool = False,
-        stopwords_language: str = 'english',
-        strict_similarity: bool = False,
-        verbose: bool = False,
-        ) -> None:
-
-    if skip_abstract:
-        data = asdata.df['title']
+    asdata: ASReviewData,
+    pid: str,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    if title_only:
+        data = asdata.df["title"]
     else:
         data = pd.Series(asdata.texts)
 
-    symbols_regex = re.compile(r'[^ \w\d\-_]')
-    spaces_regex = re.compile(r'\s+')
+    symbols_regex = re.compile(r"[^ \w\d\-_]")
+    spaces_regex = re.compile(r"\s+")
 
     # clean the data
     s = (
-        data
-        .apply(ftfy.fix_text)
-        .str.replace(symbols_regex, '', regex=True)
-        .str.replace(spaces_regex, ' ', regex=True)
+        data.apply(ftfy.fix_text)
+        .str.replace(symbols_regex, "", regex=True)
+        .str.replace(spaces_regex, " ", regex=True)
         .str.lower()
         .str.strip()
-        .replace('', None)
+        .replace("", None)
     )
 
-    if discard_stopwords:
+    if stopwords_language:
         try:
             from nltk.corpus import stopwords
+
             stopwords_set = set(stopwords.words(stopwords_language))
         except LookupError:
             import nltk
-            nltk.download('stopwords')
+
+            nltk.download("stopwords")
             stopwords_set = set(stopwords.words(stopwords_language))
 
-        stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
-        s = s.str.replace(stopwords_regex, '', regex=True)
+        stopwords_regex = re.compile(rf"\b{'\\b|\\b'.join(stopwords_set)}\b")
+        s = s.str.replace(stopwords_regex, "", regex=True)
 
     seq_matcher = SequenceMatcher()
     duplicated = [False] * len(s)
 
-    if verbose:
-        similar_list = []
-    else:
-        similar_list = None
-
+    similar_list = []
     if pid in asdata.df.columns:
         if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
             pids = asdata.df[pid].str.strip().replace("", None)
             if pid == "doi":
                 pids = pids.str.lower().str.replace(
                     r"^https?://(www\.)?doi\.org/", "", regex=True
                 )
-
         else:
             pids = asdata.df[pid]
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has the same pid or similar length
-            for j, t in s.iloc[i+1:][(asdata.df[pid] == asdata.df.iloc[i][pid]) |
-                                     (abs(s.str.len() - len(text)) < 5)].items():
+            for j, t in s.iloc[i + 1 :][
+                (asdata.df[pid] == asdata.df.iloc[i][pid])
+                | (abs(s.str.len() - len(text)) < 5)
+            ].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts have the same pid or are similar enough,
                 # mark the second one as duplicate
-                if pids.iloc[i] == pids.iloc[j] or \
-                        (seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity)):
-
-                    if verbose and not duplicated[j]:
+                if pids.iloc[i] == pids.iloc[j] or (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
-
                     duplicated[j] = True
 
-        if verbose:
-            _print_similar_list(similar_list, data, pid, pids)
-
     else:
-        print(f'Not using {pid} for deduplication because there is no such data.')
+        print(f"Not using {pid} for deduplication because there is no such data.")
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has similar length
-            for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items():
+            for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts are similar enough, mark the second one as duplicate
-                if seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity):
-
-                    if verbose and not duplicated[j]:
+                if (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
-
                     duplicated[j] = True
-
-    if verbose:
-        _print_similar_list(similar_list, data, pid)
-
     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
-
-
-def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None:
+    if verbose:
+        _print_similar_list(similar_list, data, pid)
+
+
+def deduplicate_data(
+    asdata: ASReviewData,
+    output_path: str = None,
+    pid: str = "doi",
+    similar: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Deduplicate an ASReview data object.
+
+    Parameters
+    ----------
+    asdata : ASReviewData
+        The data object.
+    output_path : str, optional
+        If provided, the deduplicated data object is stored at this location. By
+        default None.
+    pid : str, optional
+        Principal identifier to use for deduplication, by default "doi".
+    similar : bool, optional
+        Whether to deduplicate 'similar' records. The similarity of the records is
+        calculated using the `SequenceMatcher` from `difflib`. By default False.
+    threshold : float, optional
+        Threshold score above which two records are considered duplicates.
+        By default 0.98. Only applies if `similar` is set to `True`.
+    title_only : bool, optional
+        Only use the title for deduplication, by default False.
+    stopwords_language : str, optional
+        Remove stopwords from this language before deduplicating, for example 'english'.
+        By default None. Only applies if `similar` is set to `True`.
+    strict : bool, optional
+        Use a stricter algorithm to calculate the similarity between records.
+        By default False. Only applies if `similar` is set to `True`.
+    verbose : bool, optional
+        Get verbose output during deduplicating. By default False. Only applies if
+        `similar` is set to `True`.
+    """
     initial_length = len(asdata.df)
 
-    if not args.similar:
-        if args.pid not in asdata.df.columns:
-            print(
-                f'Not using {args.pid} for deduplication '
-                'because there is no such data.'
-            )
+    if not similar:
+        if pid not in asdata.df.columns:
+            print(f"Not using {pid} for deduplication because there is no such data.")
 
         # retrieve deduplicated ASReview data object
-        asdata.drop_duplicates(pid=args.pid, inplace=True)
+        asdata.drop_duplicates(pid=pid, inplace=True)
 
     else:
         _drop_duplicates_by_similarity(
-            asdata,
-            args.pid,
-            args.threshold,
-            args.title_only,
-            args.stopwords,
-            args.stopwords_language,
-            args.strict,
-            args.verbose,
-        )
+            asdata=asdata,
+            pid=pid,
+            threshold=threshold,
+            title_only=title_only,
+            stopwords_language=stopwords_language,
+            strict=strict,
+            verbose=verbose,
+        )
+
+    if output_path:
+        asdata.to_file(output_path)
 
     # count duplicates
     n_dup = initial_length - len(asdata.df)
-
-    if args.output_path:
-        asdata.to_file(args.output_path)
-        print(
-            f'Removed {n_dup} duplicates from dataset with'
-            f' {initial_length} records.'
-        )
-    else:
-        print(
-            f'Found {n_dup} duplicates in dataset with'
-            f' {initial_length} records.'
-        )
+    print(f"Found {n_dup} duplicates in dataset with {initial_length} records.")
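
A few notes on the new code, each with a small self-contained Python sketch.

The highlighting in _print_similar_list is driven by SequenceMatcher.get_opcodes(), which describes how to turn one string into the other. A minimal sketch of the same rendering technique, using invented titles rather than data from the commit:

from difflib import SequenceMatcher

from rich.console import Console
from rich.text import Text

a = "a systematic review of machine learning"
b = "a systematic review of deep learning"

sm = SequenceMatcher()
sm.set_seq1(a)
sm.set_seq2(b)

text = Text()
for tag, i1, i2, j1, j2 in sm.get_opcodes():
    # each opcode maps a[i1:i2] onto b[j1:j2]
    if tag == "replace":
        text.append(a[i1:i2], style="red strike")  # text dropped from a
        text.append(b[j1:j2], style="green")  # text introduced in b
    elif tag == "delete":
        text.append(a[i1:i2], style="red strike")
    elif tag == "insert":
        text.append(b[j1:j2], style="green")
    else:  # "equal"
        text.append(a[i1:i2], style="dim")

Console().print(text)
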
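The cleaning pipeline in _drop_duplicates_by_similarity normalizes records before any comparison: ftfy.fix_text repairs mojibake and uncurls smart quotes, the two regexes strip punctuation and collapse whitespace runs, and empty strings are turned into None so blank records are not matched against each other. A sketch of the same pipeline on one invented title:

import re

import ftfy
import pandas as pd

symbols_regex = re.compile(r"[^ \w\d\-_]")
spaces_regex = re.compile(r"\s+")

data = pd.Series(["  The naÃ¯ve “Dedup” Study!  "])
s = (
    data.apply(ftfy.fix_text)  # "naÃ¯ve" -> "naïve", curly quotes -> straight
    .str.replace(symbols_regex, "", regex=True)  # drop quotes and "!"
    .str.replace(spaces_regex, " ", regex=True)  # collapse whitespace runs
    .str.lower()
    .str.strip()
    .replace("", None)  # all-punctuation records become missing
)
print(s.iloc[0])  # expected: "the naïve dedup study"
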
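The duplicate test chains real_quick_ratio(), quick_ratio(), and ratio() from cheapest to most expensive. The first two are documented upper bounds on ratio(), computed from the string lengths and from character counts respectively, so once either falls below the threshold the exact ratio cannot pass and the `and` chain short-circuits. Without strict, the exact ratio() is skipped entirely, which can admit pairs whose true similarity is just below the threshold; that looseness is what the strict flag removes. A standalone sketch of the cascade, with invented strings:

from difflib import SequenceMatcher

threshold = 0.98
strict = True

sm = SequenceMatcher()
sm.set_seq1("effect of exercise on depression a review")
sm.set_seq2("effects of exercise on depression a review")

# upper bounds first: if either fails, the exact ratio cannot pass either
is_duplicate = (
    sm.real_quick_ratio() > threshold  # bound from lengths only
    and sm.quick_ratio() > threshold  # bound from character counts
    and (not strict or sm.ratio() > threshold)  # exact, quadratic, strict mode only
)
print(is_duplicate)
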
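Dropping the argparse.Namespace parameter means deduplicate_data can now be called directly from Python as well as from the CLI. A hypothetical call, assuming the module path asreviewcontrib.datatools.dedup and placeholder file names:

from asreview import ASReviewData

from asreviewcontrib.datatools.dedup import deduplicate_data  # assumed module path

# load a dataset; "records.csv" is a placeholder
asdata = ASReviewData.from_file("records.csv")

# fuzzy deduplication with the new keyword arguments
deduplicate_data(
    asdata,
    output_path="records_dedup.csv",  # written via asdata.to_file
    pid="doi",
    similar=True,
    threshold=0.98,
    stopwords_language="english",
    verbose=True,
)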