global_options.py

"""Global options for analysis
"""
import os
from pathlib import Path
from typing import Dict, List, Optional, Set

# Hardware options
N_CORES: int = 4  # max number of CPU cores to use
RAM_CORENLP: str = "8G"  # max RAM allocated for parsing using CoreNLP
PARSE_CHUNK_SIZE: int = 100 # number of lines in the input file to process uing CoreNLP at once. Increase on workstations with larger RAM (e.g. to 1000 if RAM is 64G)  

# Directory locations
os.environ[
    "CORENLP_HOME"
] = "/Users/mai/stanford-corenlp-full-2020-04-20"  # location of the CoreNLP models
DATA_FOLDER: str = "data/"
MODEL_FOLDER: str = "models/" # will be created if does not exist
OUTPUT_FOLDER: str = "outputs/" # will be created if does not exist

# Parsing and analysis options
STOPWORDS: Set[str] = set(
    Path("resources", "StopWords_Generic.txt").read_text().lower().split()
)  # Set of stopwords from https://sraf.nd.edu/textual-analysis/resources/#StopWords
PHRASE_THRESHOLD: int = 10  # threshold of the phraser module (smaller -> more phrases)
PHRASE_MIN_COUNT: int = 10  # min number of times a bigram needs to appear in the corpus to be considered as a phrase
W2V_DIM: int = 300  # dimension of word2vec vectors
W2V_WINDOW: int = 5  # window size in word2vec
W2V_ITER: int = 20  # number of iterations in word2vec
N_WORDS_DIM: int = 500  # max number of words in each dimension of the dictionary
DICT_RESTRICT_VOCAB = None # change to a fraction number (e.g. 0.2) to restrict the dictionary vocab in the top 20% of most frequent vocab

# Inputs for constructing the expanded dictionary
DIMS: List[str] = ["integrity", "teamwork", "innovation", "respect", "quality"]
SEED_WORDS: Dict[str, List[str]] = {
    "integrity": [
        "integrity",
        "ethic",
        "ethical",
        "accountable",
        "accountability",
        "trust",
        "honesty",
        "honest",
        "honestly",
        "fairness",
        "responsibility",
        "responsible",
        "transparency",
        "transparent",
    ],
    "teamwork": [
        "teamwork",
        "collaboration",
        "collaborate",
        "collaborative",
        "cooperation",
        "cooperate",
        "cooperative",
    ],
    "innovation": [
        "innovation",
        "innovate",
        "innovative",
        "creativity",
        "creative",
        "create",
        "passion",
        "passionate",
        "efficiency",
        "efficient",
        "excellence",
        "pride",
    ],
    "respect": [
        "respectful",
        "talent",
        "talented",
        "employee",
        "dignity",
        "empowerment",
        "empower",
    ],
    "quality": [
        "quality",
        "customer",
        "customer_commitment",
        "dedication",
        "dedicated",
        "dedicate",
        "customer_expectation",
    ],
    "cybersecurity": [
        "cybersecurity",
        "security",
        "encryption",
        "tokenization",
        "biometrics",
    ],
    "mobile transaction": [
        "mobile transaction",
        "smartphone",
        "digital",
        "wallet",
    ],
    "data analytics": [
        "data analytics",
        "data science",
        "data",
        "big data",
        "cloud computing",
        "cloud",
        "AI",
        "artificial intelligence",
        "machine learning",
        "deep learning",
        "business intelligence",
        "BI",
    ],
    "blockchain": [
        "blockchain",
        "crypto",
        "cryptocurrency",
        "smart contracts",
    ],
    "peer to peer": [
        "peer to peer",
        "P2P",
        "consumer to customer",
        "crowdfunding",
    ],
    "internet of things": [
        "internet of things",
        "smart devices",
        "devices",
        "networks",
        "wireless",
        "wireless sensor"
    ],
    "environment": [
        "environment",
        "climate change",
        "sustainability",
        "climate"
    ],
    "Social Concerns": [
        "social",
        "social concerns",
        "diversity",
        "human rights",
        "consumer proterction",
        "animal welfare",
        "welfare"
    ],
    "governance": [
        "governance",
        "corporate governance",
        "management",
        "structure",
        "management structure",
        "employee relations",
        "employtee compensation",
        "compensation",
        "responsible investment"
    ]
}

# Create directories if not exist
Path(DATA_FOLDER, "processed", "parsed").mkdir(parents=True, exist_ok=True)
Path(DATA_FOLDER, "processed", "unigram").mkdir(parents=True, exist_ok=True)
Path(DATA_FOLDER, "processed", "bigram").mkdir(parents=True, exist_ok=True)
Path(DATA_FOLDER, "processed", "trigram").mkdir(parents=True, exist_ok=True)
Path(MODEL_FOLDER, "phrases").mkdir(parents=True, exist_ok=True)
Path(MODEL_FOLDER, "phrases").mkdir(parents=True, exist_ok=True)
Path(MODEL_FOLDER, "w2v").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "dict").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "scores").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "scores", "temp").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "scores", "word_contributions").mkdir(parents=True, exist_ok=True)