Skip to content

Commit e28f309

Browse files
committed
Simplify logic
1 parent ac64092 commit e28f309

File tree

1 file changed

+5
-17
lines changed

1 file changed

+5
-17
lines changed

asreviewcontrib/datatools/dedup.py

+5 -17
Original file line number | Diff line number | Diff line change
@@ -103,19 +103,14 @@ def _drop_duplicates_by_similarity(
103103
seq_matcher = SequenceMatcher()
104104
duplicated = [False] * len(s)
105105

106-
if verbose:
107-
similar_list = []
108-
else:
109-
similar_list = None
110-
106+
similar_list = []
111107
if pid in asdata.df.columns:
112108
if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
113109
pids = asdata.df[pid].str.strip().replace("", None)
114110
if pid == "doi":
115111
pids = pids.str.lower().str.replace(
116112
r"^https?://(www\.)?doi\.org/", "", regex=True
117113
)
118-
119114
else:
120115
pids = asdata.df[pid]
121116

@@ -136,14 +131,10 @@ def _drop_duplicates_by_similarity(
136131
and seq_matcher.quick_ratio() > threshold
137132
and (not strict or seq_matcher.ratio() > threshold)
138133
):
139-
if verbose and not duplicated[j]:
134+
if not duplicated[j]:
140135
similar_list.append((i, j))
141-
142136
duplicated[j] = True
143137

144-
if verbose:
145-
_print_similar_list(similar_list, data, pid, pids)
146-
147138
else:
148139
print(f"Not using {pid} for deduplication because there is no such data.")
149140

@@ -160,15 +151,12 @@ def _drop_duplicates_by_similarity(
160151
and seq_matcher.quick_ratio() > threshold
161152
and (not strict or seq_matcher.ratio() > threshold)
162153
):
163-
if verbose and not duplicated[j]:
154+
if not duplicated[j]:
164155
similar_list.append((i, j))
165-
166156
duplicated[j] = True
167-
168-
if verbose:
169-
_print_similar_list(similar_list, data, pid)
170-
171157
asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
158+
if verbose:
159+
_print_similar_list(similar_list, data, pid)
172160

173161

174162
def deduplicate_data(

0 commit comments

Comments
 (0)