-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmerge_datas.py
51 lines (42 loc) · 1.75 KB
/
merge_datas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import glob
import os
def merge_datas(
    data_path: str = "domains/data/",
    output_filename: str = "domains.txt",
) -> None:
    """Merge every .txt file found up to two directory levels below data_path.

    Files are collected with these glob patterns, in this order:
        - data_path/*/*.txt
        - data_path/*.txt
        - data_path/*/*/*.txt

    Within each pattern the matches are sorted, so the merged output is
    reproducible regardless of the order the filesystem lists entries in.
    The output file itself is never used as an input, even when it lives
    inside data_path and matches one of the patterns.

    Each merged file's content is followed by a single newline so that
    content from different files does not run together. Files that cannot
    be decoded as UTF-8 or cannot be read are reported on stdout and skipped.

    If no input files are found, a message is printed and the output file is
    not created or modified.

    Args:
        data_path: The root directory to search for .txt files.
        output_filename: The name of the file to write the merged content to.
    """
    file_patterns = [
        os.path.join(data_path, "*/*.txt"),  # data_path/*/*.txt
        os.path.join(data_path, "*.txt"),  # data_path/*.txt
        os.path.join(data_path, "*/*/*.txt"),  # data_path/*/*/*.txt
    ]

    # Resolve the output path once so it can be compared against each match.
    # Without this guard, an output file located inside data_path would be
    # truncated by the "w" open below and then merged into itself.
    output_real = os.path.normcase(os.path.realpath(output_filename))

    file_paths = []
    for pattern in file_patterns:
        # glob.glob() returns matches in arbitrary order; sort for determinism.
        for candidate in sorted(glob.glob(pattern)):
            if os.path.normcase(os.path.realpath(candidate)) != output_real:
                file_paths.append(candidate)

    # Check if any files were found
    if not file_paths:
        print(f"No .txt files found in '{data_path}' matching the specified patterns.")
        return

    # Merge the contents of the files
    with open(output_filename, "w", encoding="utf-8") as outfile:
        for file_path in file_paths:
            # Only the read is guarded: a failure while writing the output is
            # a real error and must not be reported as a skipped input.
            try:
                with open(file_path, "r", encoding="utf-8") as infile:
                    content = infile.read()
            except UnicodeDecodeError:
                print(f"Skipped file (likely binary): {file_path}")
                continue
            except FileNotFoundError:
                # The file disappeared between globbing and opening.
                print(f"File not found: {file_path}")
                continue
            except OSError as exc:
                # E.g. a directory named "*.txt" or a permission problem;
                # keep merging the remaining files instead of aborting.
                print(f"Could not read file: {file_path} ({exc})")
                continue

            outfile.write(content)
            # Add a newline to separate content from different files
            outfile.write("\n")
            print(f"Merged: {file_path}")

    print(f"Successfully merged files into: {output_filename}")
# Script entry point: when this file is executed directly (not imported),
# merge using the default data_path and output_filename of merge_datas().
if __name__ == "__main__":
    merge_datas()