-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_counter.py
31 lines (24 loc) · 1.02 KB
/
word_counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
from collections import Counter
# Function to read all text files in a folder and join their contents
def read_files_and_join(folder_path):
    """Concatenate the contents of every ``.txt`` file directly inside a folder.

    Only the top level of ``folder_path`` is scanned (no recursion). Files are
    read as UTF-8 and visited in sorted filename order so the result is
    reproducible; ``os.listdir`` alone gives no ordering guarantee.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        A single string made of each file's text followed by one space.
        Returns ``""`` when the folder holds no ``.txt`` files.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip other extensions, and directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            chunks.append(file.read())
        # A space after each file keeps the last word of one file from fusing
        # with the first word of the next.
        chunks.append(" ")
    # One join instead of repeated "+=" avoids quadratic string copying.
    return "".join(chunks)
# Function to find the top 100 most used words
def top_100_words(text: str, limit: int = 100) -> list:
    """Return the most frequent whitespace-separated words in ``text``.

    Words are produced by ``str.split()`` with no arguments, so any run of
    whitespace separates tokens. Tokens are counted exactly as they appear
    (case and punctuation are not normalized).

    Args:
        text: The text to analyse.
        limit: Maximum number of entries to return. Defaults to 100, which
            is the behavior the function name promises.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, at most ``limit`` long. Empty when ``text`` has no words.
    """
    return Counter(text.split()).most_common(limit)
# Main function
# Default folder containing the text files to analyse. Built with os.path.join
# so the separator is correct on every platform; a literal backslash is only a
# path separator on Windows.
folder_path = os.path.join("data", "main_tokenized")


def main(folder=folder_path):
    """Print a numbered ranking of the most used words found in ``folder``.

    Args:
        folder: Directory holding the ``.txt`` files to read. Defaults to
            the module-level ``folder_path``.
    """
    all_text = read_files_and_join(folder)
    top_100 = top_100_words(all_text)
    print("Top 100 most used words:")
    for i, (word, count) in enumerate(top_100, 1):
        print(f"{i}. {word} - {count}")


# Run the report only when executed as a script, so importing this module
# does not touch the filesystem or print anything.
if __name__ == "__main__":
    main()