# Word_Frequency_Counter/count_word_freq.py

import argparse
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
import re
import string


def preprocess(text: str) -> str:
    """
    Normalize raw text ahead of tokenization.

    The text is lowercased, then every ASCII digit and every ASCII
    punctuation character is deleted. Nothing is inserted in place of a
    deleted character, so "well-known" becomes "wellknown".

    :param text: text to pre-process
    :return: the pre-processed text
    """
    # A single translation table deletes digits and punctuation in one pass.
    unwanted_chars = string.digits + string.punctuation
    deletion_table = str.maketrans("", "", unwanted_chars)
    return text.lower().translate(deletion_table)


def run(text: str) -> FreqDist:
    """
    Count the word frequencies in a text.

    The text is pre-processed beforehand to remove uninformative
    tokens such as punctuation, numbers, stopwords, and to unify
    the same tokens by lowercasing the text.

    Stop words are matched both in their original form and in their
    pre-processed form. Pre-processing strips punctuation before
    tokenization, so a stop word such as "don't" reaches the filter as
    "dont" and would otherwise never be removed.

    The ten most frequent words are also printed to standard output.

    :param text: text to count the word frequencies in
    :return: the word frequencies in the text
    """
    # Pre-process, then tokenize the cleaned text.
    tokens = word_tokenize(preprocess(text))
    # Build the stop-word set: the raw entries plus the same entries run
    # through preprocess(), so they line up with the punctuation-free tokens.
    raw_stop_words = stopwords.words("english")
    stop_words = set(raw_stop_words)
    stop_words.update(preprocess(word) for word in raw_stop_words)
    # Remove stopwords.
    tokens = [token for token in tokens if token not in stop_words]
    # Count the frequencies.
    freq_dist = FreqDist(tokens)
    print("Top 10 most frequent words:")
    print(freq_dist.most_common(10))
    return freq_dist
    
    
if __name__ == "__main__":
    # Command-line entry point: read a text file, count its word
    # frequencies, and write them next to the input file as
    # "<stem>_freq_dist<extension>".

    # Local import: only this entry point needs path handling.
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filepath",
        "-f",
        required=True,
        help="path to the text file"
    )
    args = parser.parse_args()
    input_path = Path(args.filepath)
    # Open the text file. The encoding is explicit so the result does not
    # depend on the platform's default locale.
    with open(input_path, "r", encoding="utf-8") as f:
        text = f.read()
    # Count the frequencies.
    freq_dist = run(text)
    # One "(word, count)" tuple per line, most frequent first.
    freq_dist_str = "\n".join(str(x) for x in freq_dist.most_common())
    # Save the result.
    # Only the file name is rewritten, never the directory part, so a
    # directory that shares the file's stem (e.g. "data/data.txt") is left
    # untouched. The name is split at its first dot, so "a.tar.gz" becomes
    # "a_freq_dist.tar.gz".
    stem, dot, extension = input_path.name.partition(".")
    new_filepath = input_path.with_name(f"{stem}_freq_dist{dot}{extension}")
    with open(new_filepath, "w", encoding="utf-8") as f:
        f.write(freq_dist_str)
    print(f"\nSaved the word frequencies to '{new_filepath}'")