@@ -103,19 +103,14 @@ def _drop_duplicates_by_similarity(
103
103
seq_matcher = SequenceMatcher ()
104
104
duplicated = [False ] * len (s )
105
105
106
- if verbose :
107
- similar_list = []
108
- else :
109
- similar_list = None
110
-
106
+ similar_list = []
111
107
if pid in asdata .df .columns :
112
108
if is_string_dtype (asdata .df [pid ]) or is_object_dtype (asdata .df [pid ]):
113
109
pids = asdata .df [pid ].str .strip ().replace ("" , None )
114
110
if pid == "doi" :
115
111
pids = pids .str .lower ().str .replace (
116
112
r"^https?://(www\.)?doi\.org/" , "" , regex = True
117
113
)
118
-
119
114
else :
120
115
pids = asdata .df [pid ]
121
116
@@ -136,14 +131,10 @@ def _drop_duplicates_by_similarity(
136
131
and seq_matcher .quick_ratio () > threshold
137
132
and (not strict or seq_matcher .ratio () > threshold )
138
133
):
139
- if verbose and not duplicated [j ]:
134
+ if not duplicated [j ]:
140
135
similar_list .append ((i , j ))
141
-
142
136
duplicated [j ] = True
143
137
144
- if verbose :
145
- _print_similar_list (similar_list , data , pid , pids )
146
-
147
138
else :
148
139
print (f"Not using { pid } for deduplication because there is no such data." )
149
140
@@ -160,15 +151,12 @@ def _drop_duplicates_by_similarity(
160
151
and seq_matcher .quick_ratio () > threshold
161
152
and (not strict or seq_matcher .ratio () > threshold )
162
153
):
163
- if verbose and not duplicated [j ]:
154
+ if not duplicated [j ]:
164
155
similar_list .append ((i , j ))
165
-
166
156
duplicated [j ] = True
167
-
168
- if verbose :
169
- _print_similar_list (similar_list , data , pid )
170
-
171
157
asdata .df = asdata .df [~ pd .Series (duplicated )].reset_index (drop = True )
158
+ if verbose :
159
+ _print_similar_list (similar_list , data , pid )
172
160
173
161
174
162
def deduplicate_data (
0 commit comments