diff --git a/searchindex.js b/searchindex.js index b5d314fd..f70cbaf2 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["coherence", "dependencydistance", "descriptivestats", "extractors", "faq", "index", "information_theory", "installation", "news", "posstats", "quality", "readability", "tutorial", "tutorials/filter_corpus_using_quality", "tutorials/introductory_tutorial", "tutorials/sklearn_integration", "usingthepackage"], "filenames": ["coherence.rst", "dependencydistance.rst", "descriptivestats.rst", "extractors.rst", "faq.rst", "index.rst", "information_theory.rst", "installation.rst", "news.rst", "posstats.rst", "quality.rst", "readability.rst", "tutorial.rst", "tutorials/filter_corpus_using_quality.ipynb", "tutorials/introductory_tutorial.ipynb", "tutorials/sklearn_integration.ipynb", "usingthepackage.rst"], "titles": ["Coherence", "Dependency Distance", "Descriptive Statistics", "Extractor", "Frequently Asked Questions", "TextDescriptives", "Information Theory", "Installation", "News and Changelog", "Part-of-Speech Proportions", "Quality", "Readability", "Tutorials", "Filtering corpora using Quality", "Introductory Tutorial", "Scikit-learn Integration", "Quick Start"], "terms": {"The": [0, 1, 2, 3, 5, 6, 9, 10, 11, 15, 16], "calcul": [0, 1, 3, 5, 6, 10, 11, 14, 16], "document": [0, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, 16], "base": [0, 3, 6, 11, 13, 16], "word": [0, 6, 10, 11, 13, 16], "embed": 0, "cosin": 0, "similar": [0, 4, 13], "between": [0, 10, 13, 14], "sentenc": [0, 1, 2, 6, 9, 11, 13, 16], "textdescript": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13, 15, 16], "current": [0, 7], "implement": [0, 1, 8, 10, 11, 13, 15], "first": [0, 11, 13, 16], "order": [0, 12, 16], "second": [0, 16], "follow": [0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 13, 16], "e": [0, 4, 6, 7, 8, 10, 13, 14, 15, 16], "g": [0, 4, 7, 8, 10, 13, 14, 15, 16], "1": [0, 1, 2, 6, 8, 10, 11, 13, 14, 15, 16], "2": [0, 10, 11, 13, 14, 15], "consecut": 0, "ar": [0, 1, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 16], "two": [0, 5], "apart": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15], "spaci": [0, 1, 2, 3, 5, 6, 9, 10, 11, 13], "": [0, 5, 9, 10, 11, 13, 14, 15, 16], "span": [0, 1, 2, 6, 9, 10, 16], "method": [0, 10, 15], "i": [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16], "pipelin": [0, 1, 3, 5, 6, 9, 10, 11, 13, 14, 15, 16], "small": [0, 10, 13, 14], "medium": [0, 10], "larg": [0, 5, 8, 10, 13, 14], "transform": [0, 10], "model": [0, 3, 6, 10, 13, 14, 15, 16], "have": [0, 8, 10, 13, 14, 16], "differ": [0, 13, 14], "result": 0, "If": [0, 3, 4, 5, 9, 10, 11, 13, 14, 16], "you": [0, 4, 5, 7, 10, 12, 13, 14, 15, 16], "want": [0, 4, 10, 13, 14, 15, 16], "specif": [0, 4, 7, 13], "fasttext": 0, "should": [0, 4, 8, 10, 13], "overwrit": [0, 10, 13], "doc": [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16], "vector": 0, "attribut": [0, 1, 2, 6, 9, 10, 11, 13], "read": [0, 11, 13, 16], "more": [0, 1, 3, 6, 10, 11, 13, 14, 16], "ad": [0, 1, 2, 6, 8, 9, 10, 11], "object": [0, 1, 2, 3, 6, 9, 10, 11, 14, 16], "_": [0, 1, 2, 6, 9, 10, 11, 13, 16], "first_order_coherence_valu": 0, "A": [0, 3, 5, 6, 10, 11, 13], "list": [0, 3, 10, 12, 13, 15], "float": [0, 10, 16], "where": [0, 6, 11], "each": [0, 1, 3, 5, 6, 7, 9, 12, 13, 16], "second_order_coherence_valu": 0, "coherer": 0, "dict": [0, 1, 2, 3, 9, 10, 11, 16], "contain": [0, 3, 5, 10, 13, 14, 15, 16], "mean": [0, 1, 2, 10, 13, 15, 16], "valu": [0, 6, 10, 11, 13, 15, 16], "kei": [0, 6, 9, 14], "first_order_coher": 0, "second_order_coher": 0, "bedi": 0, "carrillo": 0, "f": [0, 10, 13], "cecchi": 0, "slezak": 0, "d": 0, "sigman": 0, "m": [0, 4, 10, 14], "mota": 0, "n": [0, 6, 10, 13], "b": 0, "ribeiro": 0, "javitt": 0, "c": [0, 4, 10, 14, 15], "copelli": 0, "corcoran": 0, "2015": 0, "autom": [0, 11, 16], "analysi": [0, 10, 15], "free": [0, 14, 15], "speech": [0, 13, 16], "predict": [0, 6], "psychosi": 0, "onset": 0, "high": [0, 11, 14], "risk": 0, "youth": 0, "npj": 0, "schizophrenia": 0, "articl": 0, "http": [0, 7, 11, 13], "doi": 0, "org": [0, 11], "10": [0, 10, 13, 14, 15], "1038": 0, "npjschz": 0, "30": [0, 10, 11], "parola": 0, "lin": 0, "j": [0, 10], "simonsen": 0, "blikst": 0, "v": [0, 5], "zhou": 0, "y": [0, 14], "wang": 0, "h": [0, 6], "inou": 0, "l": [0, 11], "koelkebeck": 0, "k": [0, 10], "fusaroli": 0, "r": [0, 4, 7], "2022": [0, 8], "disturb": 0, "assess": 0, "cross": 0, "linguist": 0, "generaliz": 0, "nlp": [0, 1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 16], "measur": [0, 1, 6, 11, 16], "research": [0, 5], "1016": 0, "schre": 0, "07": 0, "002": 0, "import": [0, 1, 2, 6, 9, 10, 11, 13, 14, 15, 16], "td": [0, 1, 2, 6, 9, 10, 11, 13, 14, 16], "load": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 15, 16], "en_core_web_lg": [0, 6, 16], "add_pip": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 16], "world": [0, 1, 2, 9, 10, 11, 16], "chang": [0, 1, 2, 8, 9, 10, 11, 16], "feel": [0, 1, 2, 9, 10, 11, 16], "water": [0, 1, 2, 9, 10, 11, 16], "earth": [0, 1, 2, 9, 10, 11, 16], "smell": [0, 1, 2, 9, 10, 11, 16], "air": [0, 1, 2, 9, 10, 11, 16], "much": [0, 1, 2, 9, 10, 11, 16], "onc": [0, 1, 2, 8, 9, 10, 11, 16], "wa": [0, 1, 2, 9, 10, 11, 13, 16], "lost": [0, 1, 2, 9, 10, 11, 16], "none": [0, 1, 2, 3, 9, 10, 11, 13, 16], "now": [0, 1, 2, 8, 9, 10, 11, 14, 15, 16], "live": [0, 1, 2, 9, 10, 11, 14, 15, 16], "who": [0, 1, 2, 9, 10, 11, 16], "rememb": [0, 1, 2, 9, 10, 11, 16], "all": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13, 14, 16], "store": [0, 1, 2, 9, 10, 11], "also": [0, 4, 6, 10, 12, 13, 16], "separ": 0, "note": [0, 4, 10, 11], "do": [0, 10, 13, 14], "ani": [0, 3, 10, 11, 14], "respect": 0, "thei": [0, 10, 13, 16], "requir": [0, 4, 6, 7, 10, 11, 13, 15], "averag": [0, 1, 10, 11], "over": [0, 1, 11], "extract": [0, 1, 2, 3, 6, 9, 10, 11, 15, 16], "datafram": [0, 1, 2, 3, 6, 9, 10, 11, 14, 15, 16], "extract_df": [0, 1, 2, 3, 6, 9, 10, 11, 14, 16], "text": [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 14, 15, 16], "0": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 15], "780735": 0, "749475": 0, "create_coherence_compon": 0, "languag": [0, 1, 2, 3, 6, 9, 10, 11, 14], "name": [0, 1, 2, 6, 9, 10, 11, 14, 15], "str": [0, 1, 2, 3, 6, 9, 10, 11], "callabl": [0, 1, 2, 9, 10, 11], "sourc": [0, 1, 2, 3, 6, 9, 10, 11, 13], "allow": [0, 1, 2, 6, 9, 10, 11, 13, 15], "pipe": [0, 1, 2, 3, 8, 9, 10, 11, 13, 15, 16], "thi": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 15, 16], "set": [0, 1, 3, 6, 8, 9, 10, 11, 13, 14, 15, 16], "paramet": [0, 1, 2, 3, 6, 9, 10, 11, 14, 16], "doe": [0, 1, 2, 9, 10, 11, 13], "need": [0, 1, 2, 9, 10, 11, 13, 15], "specifi": [0, 1, 2, 4, 9, 10, 11, 14, 16], "call": [0, 1, 2, 3, 9, 10, 11], "can": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], "option": [0, 1, 2, 3, 7, 9, 10, 11, 13, 14, 15], "argument": [0, 1, 2, 9, 10, 11, 13, 15, 16], "return": [0, 1, 2, 3, 9, 10, 11, 13], "type": [0, 1, 2, 3, 5, 9, 10, 11, 13], "exampl": [0, 1, 2, 6, 8, 9, 10, 11, 13, 15], "en_core_web_md": 0, "appli": [0, 1, 2, 9, 11, 13], "anoth": [0, 2], "get": [0, 5, 7, 12, 13, 14, 15], "dependency_dist": [1, 3, 14, 16], "add": [1, 2, 3, 9, 10, 11, 13, 16], "both": 1, "token": [1, 2, 6, 9, 10, 13, 14, 16], "under": [1, 10, 11], "syntact": 1, "complex": [1, 6, 11], "greater": 1, "liu": [1, 10], "2008": 1, "oya": 1, "2011": 1, "we": [1, 10, 11, 12, 13, 14, 15], "from": [1, 3, 4, 5, 6, 10, 11, 13, 14, 15], "take": [1, 13], "level": [1, 11], "pleas": [1, 4, 5, 13], "see": [1, 8, 13, 14, 16], "issu": [1, 4, 5], "how": [1, 5, 6, 11, 13, 14, 15], "metric": [1, 3, 5, 6, 8, 10, 11, 13, 15, 16], "propos": [1, 10], "For": [1, 10, 13, 14, 16], "standard": [1, 13, 14, 16], "deviat": [1, 14, 16], "along": 1, "proport": [1, 2, 8, 10, 11, 13, 16], "adjac": [1, 16], "relat": [1, 5, 8, 16], "whether": [1, 3, 10, 16], "an": [1, 3, 4, 6, 10, 11, 13, 14, 15, 16], "en_core_web_sm": [1, 2, 9, 10, 11, 14, 16], "access": [1, 15, 16], "same": [1, 15], "wai": 1, "3": [1, 2, 4, 5, 10, 11, 13, 14, 15], "dependency_distance_mean": 1, "dependency_distance_std": [1, 14], "prop_adjacent_dependency_relation_mean": [1, 14], "prop_adjacent_dependency_relation_std": [1, 14], "77524": 1, "553188": 1, "457143": 1, "0722806": 1, "create_dependency_distance_compon": 1, "creat": [1, 4, 9, 10, 13, 16], "factori": 1, "dependencydist": [1, 14], "descriptive_stat": [2, 3, 8, 11, 15, 16], "number": [2, 6, 10, 11, 14, 16], "count": [2, 13, 14, 16], "uniqu": [2, 14, 16], "charact": [2, 10, 11, 13, 14, 16], "sentence_length": [2, 16], "length": [2, 6, 10, 11, 13, 14, 16], "median": [2, 15, 16], "std": [2, 16], "syllabl": [2, 11, 16], "per": [2, 6, 11, 16], "token_length": [2, 16], "singl": [2, 3], "individu": [2, 16], "token_length_mean": [2, 11, 14, 15], "token_length_median": [2, 11, 14, 15], "token_length_std": [2, 11, 14, 15], "sentence_length_mean": [2, 11, 14, 15], "sentence_length_median": [2, 11, 14, 15], "sentence_length_std": [2, 11, 14, 15], "syllables_per_token_mean": [2, 11, 14, 15], "syllables_per_token_median": [2, 11, 14, 15], "syllables_per_token_std": [2, 11, 15], "n_token": [2, 11, 14, 15], "n_unique_token": [2, 11, 14, 15], "proportion_unique_token": [2, 11, 15], "n_charact": [2, 11, 14, 15], "n_sentenc": [2, 11, 15], "28571": [2, 11], "54127": [2, 11], "7": [2, 4, 8, 10, 11, 13, 14, 15], "6": [2, 10, 11, 13, 14, 15, 16], "09839": [2, 11], "08571": [2, 11], "368117": [2, 11], "35": [2, 11], "23": [2, 11], "657143": [2, 11], "121": [2, 11], "5": [2, 8, 10, 11, 13, 14, 15], "create_descriptive_stats_compon": 2, "verbos": [2, 11], "bool": [2, 3, 9, 10, 11], "descriptivestatist": [2, 11], "blank": [2, 6, 10, 11, 13, 16], "en": [2, 11, 13, 14, 15, 16], "stat": 2, "featur": [3, 5, 15], "extract_metr": [3, 15, 16], "meant": 3, "raw": 3, "wherea": 3, "extract_dict": [3, 16], "work": [3, 11, 13, 16], "union": [3, 10], "lang": [3, 14, 15, 16], "iter": 3, "spacy_model": [3, 14, 16], "spacy_model_s": [3, 14], "lg": 3, "panda": [3, 10, 15, 16], "provid": [3, 15], "automat": [3, 14, 16], "download": [3, 12, 13, 14, 16], "default": [3, 8, 9, 10, 11, 13, 16], "which": [3, 4, 6, 8, 10, 12, 13, 14, 15, 16], "One": 3, "readabl": [3, 5, 14, 16], "pos_proport": [3, 8, 9, 16], "coher": [3, 6, 16], "qualiti": [3, 8, 12, 16], "information_theori": [3, 6, 16], "one": [3, 6, 9, 10, 13, 16], "size": [3, 14], "row": [3, 14, 16], "column": [3, 14, 15, 16], "pd": [3, 15], "include_text": [3, 14, 16], "true": [3, 9, 10, 11, 13, 14], "gener": [3, 5, 13], "case": [3, 13], "compoen": 3, "ha": [3, 8], "been": [3, 8, 10], "dictionari": [3, 6, 10, 16], "entri": [3, 14, 15], "come": 4, "ensur": 4, "function": [4, 5, 10, 14, 15, 16], "To": [4, 5, 7, 12, 13, 14, 15, 16], "clone": [4, 7], "repositori": [4, 7, 13], "instal": [4, 5, 13, 14, 15], "pip": [4, 7, 13, 14, 15], "style": [4, 7], "txt": [4, 7], "depend": [4, 5, 7, 14, 16], "well": [4, 6, 13], "lint": [4, 7], "These": [4, 6, 10, 13], "pyproject": 4, "toml": 4, "file": 4, "pytest": 4, "folder": 4, "python": [4, 5, 14], "desired_test": 4, "py": [4, 13], "check": [4, 10, 13, 16], "coverag": 4, "cov": 4, "intend": 4, "major": [4, 13], "o": 4, "includ": [4, 6, 7, 10, 14, 16], "window": 4, "latest": 4, "version": [4, 13], "maco": 4, "linux": 4, "ubuntu": 4, "similarli": [4, 6], "8": [4, 10, 11, 13, 14, 15], "9": [4, 10, 13, 15], "onli": [4, 10, 13, 14, 15, 16], "system": 4, "being": 4, "activ": 4, "earlier": 4, "like": [4, 6, 13], "sphinx": 4, "It": [4, 6, 10, 11, 13], "furo": 4, "theme": 4, "custom": [4, 13], "make": [4, 14, 15], "extens": [4, 5, 10, 13], "html": 4, "librari": [5, 16], "varieti": [5, 13, 14], "statist": [5, 14, 16], "compon": [5, 8, 13, 14, 15], "sever": 5, "descript": [5, 16], "distanc": [5, 14, 16], "wish": [5, 10, 13], "try": [5, 13, 14, 15], "out": [5, 10], "packag": [5, 7, 12, 13, 14, 15, 16], "onlin": 5, "demo": 5, "organ": 5, "part": [5, 10, 13, 14, 16], "start": [5, 7, 10, 12, 13, 14], "instruct": 5, "guid": 5, "tutori": [5, 7, 15], "refer": [5, 10], "public": 5, "class": [5, 13], "report": 5, "request": 5, "github": [5, 7], "tracker": 5, "kindli": 5, "forum": [5, 13], "otherwis": [5, 16], "discuss": 5, "bug": 5, "idea": 5, "usag": 5, "your": [5, 7, 13, 14, 16], "cite": 5, "preprint": [5, 10], "index": [5, 11, 14, 16], "page": 5, "theoret": 6, "deriv": 6, "entropi": [6, 16], "shannon": 6, "prob": 6, "probabl": [6, 13], "defin": 6, "x": [6, 13, 14], "sum_": 6, "p": [6, 10], "x_i": 6, "log_": 6, "perplex": [6, 16], "distribut": [6, 13, 14], "sampl": [6, 13], "ppl": 6, "per_word_perplex": [6, 16], "divid": [6, 15], "se": 6, "consid": [6, 10], "normal": 6, "often": [6, 13], "describ": 6, "higher": [6, 11, 13], "could": [6, 10, 13, 14], "imagin": 6, "filter": [6, 8, 10, 12], "given": 6, "assumpt": 6, "highli": 6, "surpris": 6, "fact": 6, "non": 6, "piec": [6, 11], "avail": [6, 13, 14, 15], "lexem": 6, "prop": 6, "tabl": [6, 16], "warn": [6, 11, 13], "rais": [6, 11, 13], "np": [6, 11], "nan": [6, 11, 14], "cannot": 6, "found": [6, 14], "simpl": [6, 9, 11, 13, 14], "veri": [6, 13], "288195": 6, "334017": 6, "190574": 6, "create_information_theory_compon": 6, "informationtheori": 6, "command": [6, 7, 13], "run": [7, 10, 12, 13, 14, 16], "line": [7, 10], "termin": 7, "dependendic": 7, "build": [7, 14], "test": [7, 9, 10, 11, 13, 14, 15], "git": 7, "com": 7, "hlass": 7, "link": 7, "t": [7, 10, 13, 14, 15, 16], "extra": [7, 10], "v2": 8, "1st": 8, "januari": 8, "2023": 8, "renam": 8, "prefix": [8, 15, 16], "pos_stat": 8, "consist": [8, 13], "v1": 8, "21st": 8, "septemb": 8, "seri": [8, 13], "some": [8, 10, 13, 14, 16], "were": [8, 10, 13, 16], "rae": [8, 10], "et": [8, 10], "al": [8, 10], "2021": [8, 10], "raffel": [8, 10], "2020": [8, 10], "corpora": [8, 10, 12], "4th": 8, "mai": [8, 13], "minor": 8, "fix": 8, "bell": 8, "whistl": 8, "octob": 8, "po": [8, 9], "pos_": 8, "instead": [8, 10, 11, 13, 14], "tag_": 8, "behavior": 8, "use_tag": 8, "fals": [8, 9, 10, 13, 14, 15, 16], "when": [8, 13], "initialis": 8, "modul": [8, 11], "pos_prop_postag": 9, "tag": [9, 16], "postag": 9, "By": [9, 16], "possibl": 9, "behaviour": [9, 16], "turn": 9, "off": 9, "add_all_tag": 9, "initi": [9, 14], "pos_prop_adj": 9, "pos_prop_adp": 9, "pos_prop_adv": 9, "pos_prop_aux": 9, "pos_prop_cconj": 9, "pos_prop_det": 9, "pos_prop_intj": 9, "pos_prop_noun": 9, "pos_prop_num": 9, "pos_prop_part": 9, "pos_prop_pron": 9, "pos_prop_propn": 9, "pos_prop_punct": 9, "pos_prop_sconj": 9, "pos_prop_sym": 9, "pos_prop_verb": 9, "pos_prop_x": 9, "0243902": 9, "097561": 9, "0487805": 9, "0731707": 9, "121951": 9, "195122": 9, "146341": 9, "170732": 9, "create_pos_proportions_compon": 9, "use_po": 9, "pospropot": 9, "detail": [9, 16], "posproport": 9, "heurist": [10, 13], "stop": [10, 13], "n_stop_word": [10, 13], "alpha": [10, 13], "ratio": [10, 13], "alpha_ratio": [10, 13], "least": [10, 13], "alphabet": [10, 13], "mean_word_length": [10, 13], "ellipsi": 10, "proportion_ellipsi": [10, 13], "end": [10, 15], "bullet": 10, "point": [10, 14, 15], "proportion_bullet_point": [10, 13], "symbol": [10, 13], "symbol_": 10, "_2_word_ratio": 10, "hashtag": [10, 13], "curli": 10, "bracket": 10, "string": [10, 13], "contains_": 10, "instanc": [10, 13], "lorem": [10, 13], "ipsum": [10, 13], "vocabulari": 10, "oov_ratio": [10, 13], "total": [10, 11, 13], "repetiti": 10, "duplic": [10, 13], "fraction": [10, 13], "duplicate_lines_chr_fract": 10, "within": 10, "paragraph": [10, 13], "duplicate_paragraphs_chr_fract": 10, "gram": [10, 13], "duplicate_": 10, "_gram_chr_fract": 10, "rang": [10, 13], "top": [10, 14], "top_": 10, "pre": 10, "train": [10, 13, 15], "optim": 10, "speed": 10, "rather": 10, "usabl": 10, "simplic": 10, "integr": [10, 12, 16], "corpu": [10, 13, 14], "danish": [10, 13], "foundat": 10, "other": [10, 13, 16], "dedupl": 10, "strategi": [10, 15], "pass": [10, 13, 14, 15, 16], "passed_quality_check": [10, 13, 16], "doc_length": [10, 13], "duplicate_line_chr_fract": [10, 13], "duplicate_paragraph_chr_fract": [10, 13], "duplicate_5": 10, "gram_chr_fract": [10, 13], "duplicate_6": 10, "duplicate_7": 10, "duplicate_8": 10, "duplicate_9": 10, "duplicate_10": [10, 13], "top_2": 10, "top_3": 10, "top_4": 10, "contains_lorem": 10, "24": 10, "853659": 10, "95122": 10, "41": 10, "232258": 10, "0580645": 10, "174194": 10, "threshold": [10, 13], "so": [10, 13, 14, 15], "qualitythreshold": [10, 13], "just": [10, 11, 13], "upper": [10, 13], "bound": [10, 13], "100000": [10, 13], "symbol_to_word_ratio": [10, 13], "duplicate_ngram_chr_fract": [10, 13], "15": [10, 11, 13, 14], "14": [10, 13, 14], "13": [10, 13, 14, 15], "12": [10, 11, 13, 15], "11": [10, 11, 13, 15], "top_ngram_chr_fract": [10, 13], "18": [10, 11, 13], "4": [10, 11, 13, 14, 15], "16": [10, 13], "quality_pip": [10, 13], "set_quality_threshold": [10, 13], "updat": [10, 13], "create_quality_compon": 10, "top_ngram_rang": 10, "tupl": 10, "int": 10, "top_ngram_min_count": 10, "duplicate_n_gram_fraction_rang": 10, "vocab": 10, "map": 10, "forc": 10, "overwritten": 10, "split": [10, 13, 15], "rn": 10, "low": [10, 13], "w": 10, "borgeaud": 10, "cai": 10, "millican": 10, "hoffmann": 10, "song": 10, "irv": 10, "scale": 10, "insight": 10, "gopher": 10, "arxiv": 10, "2112": 10, "11446": 10, "shazeer": 10, "robert": 10, "lee": 10, "narang": 10, "matena": 10, "explor": [10, 13], "limit": 10, "transfer": 10, "learn": [10, 12], "unifi": 10, "mach": 10, "re": [10, 16], "21": [10, 11], "140": 10, "67": 10, "them": [10, 14, 15, 16], "minimum": 10, "time": [10, 13, 14, 15], "must": [10, 13], "occur": 10, "exist": [10, 13], "spacy_qu": 10, "pydant": 10, "quality_data_class": 10, "config": 10, "forbid": 10, "field": 10, "70": 10, "lower": [10, 11, 13], "origin": [10, 11, 14], "account": 10, "adiffer": 10, "definit": 10, "boundari": 10, "punctuat": 10, "isnot": 10, "100_000": [10, 13], "most": [10, 13], "20": 10, "80": [10, 11], "symboloccurr": 10, "occurr": 10, "100": [10, 11, 13], "appear": [10, 13], "than": [10, 11, 13], "qualityoutput": [10, 13], "output": [10, 13, 15, 16], "thresholdsoutput": [10, 13], "presenc": 10, "to_flat_value_dict": 10, "flat": 10, "represent": 10, "easi": [10, 11, 13], "convers": 10, "properti": 10, "three": [10, 11, 13], "item": 10, "either": 10, "interv": 10, "accept": 10, "boolean": [10, 14, 16], "t_out": 10, "readabiltii": [11, 16], "hyphen": 11, "pyphen": 11, "support": 11, "gun": [11, 16], "fog": [11, 16], "wikipedia": 11, "wiki": 11, "gunning_fog_index": 11, "__": 11, "develop": 11, "english": [11, 14], "write": 11, "estim": 11, "year": 11, "formal": 11, "educ": 11, "understand": 11, "u": [11, 13, 14, 15], "school": 11, "senior": 11, "around": [11, 13], "old": 11, "formula": 11, "grade": [11, 16], "asl": 11, "phw": 11, "percentag": 11, "hard": 11, "smog": [11, 14, 16], "gobbledygook": 11, "primarili": 11, "focus": 11, "polysyllab": 11, "043": 11, "1291": 11, "flesch": [11, 16], "eas": [11, 16], "e2": 11, "93kincaid_readability_test": 11, "flesch_reading_eas": [11, 14], "score": [11, 13, 15, 16], "indic": [11, 16], "easier": [11, 15], "while": [11, 13], "difficult": 11, "206": 11, "835": 11, "015": 11, "84": 11, "asw": 11, "kincaid": [11, 16], "93kincaid_grade_level": 11, "comprehend": 11, "39": 11, "59": 11, "automated_readability_index": [11, 14], "approxim": 11, "ari": 11, "71": 11, "n_char": 11, "n_word": 11, "43": 11, "coleman": [11, 16], "liau": [11, 16], "93liau_index": 11, "___": 11, "letter": 11, "cli": 11, "0588": 11, "296": 11, "In": [11, 13, 14, 15], "our": [11, 13, 14], "entir": 11, "lix": [11, 14, 16], "lix_": 11, "readability_test": 11, "lesbarhetsindex": 11, "long": [11, 13], "six": 11, "n_long_word": 11, "rix": [11, 14, 16], "www": 11, "jstor": 11, "stabl": 11, "40031755": 11, "difficulti": 11, "flesch_kincaid_grad": [11, 14], "gunning_fog": [11, 14], "coleman_liau_index": [11, 14], "107": 11, "879": 11, "0485714": 11, "68392": 11, "94286": 11, "45429": 11, "708571": 11, "7143": 11, "create_readability_compon": 11, "alreadi": [11, 14, 15], "toggl": 11, "show": [11, 13, 16], "messag": [11, 13, 14, 15], "recommend": 12, "go": [12, 14, 15], "through": [12, 13, 14, 15], "below": [12, 13, 16], "jupyt": 12, "notebook": 12, "local": [12, 13], "introductori": [12, 15], "scikit": 12, "mani": 13, "analys": [13, 14], "tweet": 13, "scrape": 13, "remov": [13, 15], "huggingfac": 13, "thu": 13, "Or": 13, "except": [13, 14, 15], "mc4": 13, "would": [13, 15], "ag": 13, "whole": [13, 16], "stream": 13, "down": 13, "1000": 13, "load_dataset": 13, "000": 13, "home": 13, "runner": 13, "lib": 13, "python3": 13, "site": 13, "1429": 13, "futurewarn": 13, "code": [13, 14], "execut": 13, "correctli": 13, "inspect": 13, "hf": 13, "co": 13, "avoid": 13, "futur": 13, "trust_remote_cod": 13, "mandatori": 13, "next": [13, 14], "releas": 13, "let": [13, 14, 15], "look": 13, "400": 13, "print": [13, 15], "post": 13, "362": 13, "info": 13, "okai": 13, "those": 13, "help": [13, 16], "laugh": 13, "ask": 13, "about": [13, 14], "ohm": 13, "power": 13, "lsi15": 13, "know": 13, "book": 13, "websit": 13, "someon": 13, "talk": 13, "seek": 13, "what": 13, "share": 13, "me": 13, "question": 13, "gain": 13, "audio": 13, "thats": 13, "Not": 13, "up": 13, "segment": 13, "here": 13, "quit": 13, "especi": 13, "simpli": [13, 14, 15], "examin": 13, "seem": 13, "did": 13, "why": 13, "435": 13, "79": 13, "52": 13, "894": 13, "42": [13, 15], "38": 13, "36": 13, "01": 13, "natur": 13, "might": 13, "easili": [13, 14], "realli": 13, "repitit": 13, "sign": 13, "max": 13, "quality_threshold": 13, "typic": 13, "interest": [13, 16], "filtered_text": 13, "len": 13, "process": 13, "572": 13, "lot": [13, 14], "howev": 13, "unreason": 13, "adjust": 13, "new_threshold": 13, "don": [13, 14, 15, 16], "dynam": 13, "new": 13, "tune": 13, "reason": 13, "bit": [13, 14], "further": 13, "section": [13, 16], "lead": 13, "problem": 13, "directli": 13, "gigaword": 13, "purpos": 13, "2500": 13, "collect": [13, 14, 15], "script": 13, "error": 13, "littl": 13, "ddsc": 13, "partial": 13, "ten_sampl": 13, "select": 13, "to_panda": 13, "As": [13, 15], "previous": 13, "mention": 13, "multipl": 13, "retsinformationdk": 13, "legal": 13, "hest": 13, "debat": 13, "nettet": 13, "dk": 13, "spont": 13, "transcrib": 13, "spontan": 13, "lambda": 13, "num_proc": 13, "tv2r": 13, "notabl": 13, "fewer": 13, "after": [13, 15], "prepar": 13, "extrem": 13, "thing": 13, "creas": 13, "da": [13, 16], "max_length": 13, "2000000": 13, "increas": 13, "worth": 13, "beforehand": 13, "legal_doc": 13, "effici": 13, "format": [13, 16], "50": 13, "uncommon": 13, "phrase": 13, "previou": 13, "close": 13, "mostli": 13, "made": 13, "good": 13, "later": 13, "common": 13, "assum": [13, 14], "keep": 13, "legal_docs_filt": 13, "had": 13, "That": [13, 14], "seaborn": [13, 14], "sn": [13, 14], "def": 13, "get_duplicate_10_gram_fract": 13, "duplicate_10_gram_fract": 13, "histplot": 13, "explain": 13, "everyth": 13, "relev": [13, 16], "few": [13, 14], "perculiar": 13, "behav": 13, "news_doc": 13, "speech_doc": 13, "news_alpha_ratio": 13, "speech_alpha_ratio": 13, "plot": [13, 14], "histogram": 13, "matplotlib": [13, 14], "pyplot": [13, 14], "plt": [13, 14], "label": [13, 14, 15], "binwidth": 13, "05": 13, "xlabel": [13, 14], "ylabel": [13, 14], "legend": 13, "coupl": 13, "fair": 13, "amount": 13, "abov": 13, "almost": 13, "suspic": 13, "depth": 13, "dento": 13, "speaker": 13, "alpa": 13, "problemat": 13, "task": 13, "hand": 13, "therefor": 13, "target": 13, "quickli": [14, 16], "spend": 14, "exactli": 14, "care": 14, "introduc": 14, "inform": [14, 15, 16], "ll": [14, 15], "quick": 14, "overview": 14, "sm": [14, 15], "spam": [14, 15], "dataset": [14, 15], "5572": [14, 15], "categor": [14, 15], "ham": [14, 15], "familiar": 14, "util": [14, 15], "load_sms_data": [14, 15], "df": [14, 15, 16], "head": [14, 15, 16], "until": [14, 15], "jurong": [14, 15], "crazi": [14, 15], "ok": [14, 15], "lar": [14, 15], "joke": [14, 15], "wif": [14, 15], "oni": [14, 15], "wkly": [14, 15], "comp": [14, 15], "win": [14, 15], "fa": [14, 15], "cup": [14, 15], "fina": [14, 15], "dun": [14, 15], "sai": [14, 15], "earli": [14, 15], "hor": [14, 15], "nah": [14, 15], "think": [14, 15], "he": [14, 15, 16], "goe": [14, 15], "usf": [14, 15], "aro": [14, 15], "value_count": 14, "4825": 14, "747": 14, "dtype": 14, "int64": 14, "handl": 14, "boilerpl": 14, "haven": 14, "befor": 14, "appropri": [14, 16], "en_core_web_model": 14, "altern": 14, "join": 14, "metrics_df": 14, "drop": [14, 15], "150000": 14, "904600": 14, "250000": 14, "388889": 14, "055556": 14, "90": 14, "935000": 14, "060000": 14, "116500": 14, "642000": 14, "000000": 14, "527525": 14, "166667": 14, "433333": 14, "233333": 14, "105": 14, "090000": 14, "653333": 14, "800000": 14, "026667": 14, "392857": 14, "298693": 14, "178571": 14, "521008": 14, "050420": 14, "92": 14, "917857": 14, "777143": 14, "260357": 14, "915714": 14, "28": 14, "285714": 14, "758098": 14, "535714": 14, "035714": 14, "116": 14, "652500": 14, "645000": 14, "550000": 14, "541818": 14, "590909": 14, "461538": 14, "598816": 14, "400000": 14, "109": 14, "040000": 14, "280000": 14, "373846": 14, "276923": 14, "larger": 14, "yet": 14, "0x7f4a4c45aa10": 14, "whenev": 14, "spacy_pipe_metrics_df": 14, "left": 14, "assur": 14, "ourselv": 14, "ident": 14, "equal": 14, "With": 14, "sens": 14, "boxplot": 14, "ax": 14, "correl": 14, "strongli": 14, "encod": 14, "is_ham": 14, "comput": 14, "metrics_correl": 14, "corrwith": 14, "sort_valu": [14, 15], "ab": 14, "ascend": [14, 15], "409000": 14, "408325": 14, "363069": 14, "322233": 14, "322176": 14, "303665": 14, "298789": 14, "296832": 14, "294734": 14, "294251": 14, "float64": 14, "pretti": 14, "kde": 14, "fig": 14, "subplot": 14, "figsiz": 14, "sharei": 14, "enumer": 14, "kdeplot": 14, "hue": 14, "cool": 14, "ve": 14, "done": 14, "actual": 14, "step": 14, "continu": 14, "classifi": [14, 15], "extractor": 15, "saw": 15, "exploratori": 15, "data": 15, "walk": 15, "sklearn": 15, "classif": 15, "instanti": 15, "textdecriptivesfeatur": 15, "wrap": 15, "alright": 15, "sklearn_featur": 15, "textdescriptivesfeatur": 15, "descriptive_stats_extractor": 15, "sure": 15, "columntransform": 15, "necessari": [15, 16], "oper": 15, "miss": 15, "simpleimput": 15, "imput": 15, "randomforestclassifi": 15, "evalu": 15, "ensembl": 15, "compos": 15, "model_select": 15, "train_test_split": 15, "set_config": 15, "tell": 15, "transform_output": 15, "text_process": 15, "text_processing__": 15, "verbose_feature_names_out": 15, "x_train": 15, "x_test": 15, "y_train": 15, "y_test": 15, "axi": 15, "test_siz": 15, "random_st": 15, "fit": 15, "accuraci": 15, "9443946188340807": 15, "nice": 15, "get_features_out": 15, "feature_importance_map": 15, "zip": 15, "feature_names_in_": 15, "named_step": 15, "feature_importances_": 15, "sort": 15, "df_import": 15, "195236": 15, "147239": 15, "125203": 15, "092848": 15, "075665": 15, "056021": 15, "054586": 15, "053780": 15, "050374": 15, "039930": 15, "039297": 15, "037317": 15, "030786": 15, "001717": 15, "desir": 16, "auto": 16, "vice": 16, "versa": 16, "syntax": 16, "shorthand": 16, "conveni": 16, "control": 16, "extract_x": 16, "felt": 16, "hi": 16, "life": 16, "kind": 16, "dream": 16, "sometim": 16, "wonder": 16, "whose": 16, "enjoi": 16, "parser": 16, "textdesript": 16, "jeg": 16, "var": 16, "atten": 16, "tog": 16, "patent": 16, "p\u00e5": 16, "ild": 16, "det": 16, "skull": 16, "sener": 16, "vise": 16, "sig": 16, "blive": 16, "meget": 16, "indbringend": 16, "forretn": 16, "spi": 16, "skovsneglen": 16, "mull": 16, "du": 16, "vil": 16, "jo": 16, "gern": 16, "v\u00e6re": 16, "med": 16, "hulen": 16, "ikk": 16, "clever": 16, "enough": 16, "textdecript": 16, "theori": 16}, "objects": {"textdescriptives.components.coherence": [[0, 0, 1, "", "create_coherence_component"]], "textdescriptives.components.dependency_distance": [[1, 0, 1, "", "create_dependency_distance_component"]], "textdescriptives.components.descriptive_stats": [[2, 0, 1, "", "create_descriptive_stats_component"]], "textdescriptives.components.information_theory": [[6, 0, 1, "", "create_information_theory_component"]], "textdescriptives.components.pos_proportions": [[9, 0, 1, "", "create_pos_proportions_component"]], "textdescriptives.components.quality": [[10, 0, 1, "", "create_quality_component"]], "textdescriptives.components.quality_data_classes": [[10, 1, 1, "", "QualityOutput"], [10, 1, 1, "", "QualityThresholds"], [10, 1, 1, "", "ThresholdsOutput"]], "textdescriptives.components.quality_data_classes.QualityOutput": [[10, 2, 1, "", "alpha_ratio"], [10, 2, 1, "", "contains"], [10, 2, 1, "", "doc_length"], [10, 2, 1, "", "duplicate_line_chr_fraction"], [10, 2, 1, "", "duplicate_ngram_chr_fraction"], [10, 2, 1, "", "duplicate_paragraph_chr_fraction"], [10, 2, 1, "", "mean_word_length"], [10, 2, 1, "", "n_stop_words"], [10, 2, 1, "", "oov_ratio"], [10, 3, 1, "", "passed"], [10, 2, 1, "", "proportion_bullet_points"], [10, 2, 1, "", "proportion_ellipsis"], [10, 2, 1, "", "symbol_to_word_ratio"], [10, 4, 1, "", "to_flat_value_dict"], [10, 2, 1, "", "top_ngram_chr_fraction"]], "textdescriptives.components.quality_data_classes.QualityThresholds": [[10, 2, 1, "", "alpha_ratio"], [10, 2, 1, "", "contains"], [10, 2, 1, "", "doc_length"], [10, 2, 1, "", "duplicate_line_chr_fraction"], [10, 2, 1, "", "duplicate_ngram_chr_fraction"], [10, 2, 1, "", "duplicate_paragraph_chr_fraction"], [10, 2, 1, "", "mean_word_length"], [10, 2, 1, "", "n_stop_words"], [10, 2, 1, "", "oov_ratio"], [10, 2, 1, "", "proportion_bullet_points"], [10, 2, 1, "", "proportion_ellipsis"], [10, 2, 1, "", "symbol_to_word_ratio"], [10, 2, 1, "", "top_ngram_chr_fraction"]], "textdescriptives.components.quality_data_classes.ThresholdsOutput": [[10, 3, 1, "", "passed"], [10, 2, 1, "", "threshold"], [10, 2, 1, "", "value"]], "textdescriptives.components.readability": [[11, 0, 1, "", "create_readability_component"]], "textdescriptives.extractors": [[3, 0, 1, "", "extract_df"], [3, 0, 1, "", "extract_dict"], [3, 0, 1, "", "extract_metrics"]]}, "objtypes": {"0": "py:function", "1": "py:pydantic_model", "2": "py:pydantic_field", "3": "py:property", "4": "py:method"}, "objnames": {"0": ["py", "function", "Python function"], "1": ["py", "pydantic_model", "Python model"], "2": ["py", "pydantic_field", "Python field"], "3": ["py", "property", "Python property"], "4": ["py", "method", "Python method"]}, "titleterms": {"coher": 0, "usag": [0, 1, 2, 6, 9, 10, 11, 16], "compon": [0, 1, 2, 6, 9, 10, 11, 16], "depend": 1, "distanc": 1, "descript": 2, "statist": 2, "extractor": 3, "api": 3, "frequent": 4, "ask": [4, 5], "question": [4, 5], "how": 4, "do": 4, "i": [4, 13], "test": 4, "code": 4, "run": 4, "suit": 4, "doe": 4, "thi": 4, "packag": 4, "x": 4, "document": 4, "gener": 4, "textdescript": [5, 14], "where": 5, "citat": 5, "indic": 5, "search": 5, "inform": 6, "theori": 6, "instal": 7, "develop": 7, "new": 8, "changelog": 8, "part": 9, "speech": 9, "proport": 9, "qualiti": [10, 13], "data": [10, 13, 14], "class": 10, "readabl": 11, "tutori": [12, 13, 14], "filter": 13, "corpora": 13, "us": [13, 14, 16], "setup": [13, 15], "web": 13, "content": 13, "The": [13, 14], "extract": [13, 14], "high": 13, "text": 13, "chang": 13, "compar": 13, "domain": 13, "note": 13, "dagw": 13, "dataset": 13, "current": 13, "unavail": 13, "due": 13, "copyright": 13, "disput": 13, "remaind": 13, "ha": 13, "been": 13, "disabl": 13, "now": 13, "convert": 13, "markdown": 13, "re": 13, "enabl": 13, "onc": 13, "settl": 13, "out": 13, "across": 13, "introductori": 14, "easi": 14, "wai": 14, "metric": 14, "extract_metr": 14, "configur": 14, "add": 14, "pipe": 14, "spaci": [14, 16], "exploratori": 14, "analysi": 14, "scikit": 15, "learn": 15, "integr": 15, "quick": 16, "start": 16, "specif": 16, "avail": 16, "attribut": 16}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 57}, "alltitles": {"Coherence": [[0, "coherence"]], "Usage": [[0, "usage"], [1, "usage"], [2, "usage"], [6, "usage"], [9, "usage"], [10, "usage"], [11, "usage"]], "Component": [[0, "component"], [1, "component"], [2, "component"], [6, "component"], [9, "component"], [10, "component"], [11, "component"]], "Dependency Distance": [[1, "dependency-distance"]], "Descriptive Statistics": [[2, "descriptive-statistics"]], "Extractor": [[3, "extractor"]], "API": [[3, "api"]], "Frequently Asked Questions": [[4, "frequently-asked-questions"]], "How do I test the code and run the test suite?": [[4, "how-do-i-test-the-code-and-run-the-test-suite"]], "Does this package run on X?": [[4, "does-this-package-run-on-x"]], "How is the documentation generated?": [[4, "how-is-the-documentation-generated"]], "TextDescriptives": [[5, "textdescriptives"]], "Where to ask questions?": [[5, "where-to-ask-questions"]], "Citation": [[5, "citation"]], "Indices and search": [[5, "indices-and-search"]], "Information Theory": [[6, "information-theory"]], "Installation": [[7, "installation"]], "Development Installation": [[7, "development-installation"]], "News and Changelog": [[8, "news-and-changelog"]], "Part-of-Speech Proportions": [[9, "part-of-speech-proportions"]], "Quality": [[10, "quality"]], "Data Classes": [[10, "data-classes"]], "Readability": [[11, "readability"]], "Tutorials": [[12, "tutorials"], [12, null]], "Filtering corpora using Quality": [[13, "filtering-corpora-using-quality"]], "Setup": [[13, "setup"], [15, "setup"]], "Filtering Web content": [[13, "filtering-web-content"]], "The Data": [[13, "the-data"]], "Filtering": [[13, "filtering"]], "Extracting high quality texts": [[13, "extracting-high-quality-texts"]], "Changing the filters": [[13, "changing-the-filters"]], "Comparing Domains": [[13, "comparing-domains"]], "Data": [[13, "data"]], "NOTE: The DAGW dataset is currently unavailable due to a copyright dispute. The remainder of the tutorial has been disabled for now (converted to markdown), and will be re-enabled once the dispute settles.": [[13, "note-the-dagw-dataset-is-currently-unavailable-due-to-a-copyright-dispute-the-remainder-of-the-tutorial-has-been-disabled-for-now-converted-to-markdown-and-will-be-re-enabled-once-the-dispute-settles"]], "Quality Filtering": [[13, "quality-filtering"]], "Filtering out the text": [[13, "filtering-out-the-text"]], "Comparing across domains": [[13, "comparing-across-domains"]], "Introductory Tutorial": [[14, "introductory-tutorial"]], "Using TextDescriptives": [[14, "using-textdescriptives"]], "The easy way: Extract metrics with extract_metrics": [[14, "the-easy-way-extract-metrics-with-extract-metrics"]], "The configurable way: Add pipes to spaCy": [[14, "the-configurable-way-add-pipes-to-spacy"]], "Exploratory Data Analysis": [[14, "exploratory-data-analysis"]], "Scikit-learn Integration": [[15, "scikit-learn-integration"]], "Quick Start": [[16, "quick-start"]], "Usage with spaCy": [[16, "usage-with-spacy"]], "Using Specific Components": [[16, "using-specific-components"]], "Available Attributes": [[16, "available-attributes"]]}, "indexentries": {"create_coherence_component() (in module textdescriptives.components.coherence)": [[0, "textdescriptives.components.coherence.create_coherence_component"]], "create_dependency_distance_component() (in module textdescriptives.components.dependency_distance)": [[1, "textdescriptives.components.dependency_distance.create_dependency_distance_component"]], "create_descriptive_stats_component() (in module textdescriptives.components.descriptive_stats)": [[2, "textdescriptives.components.descriptive_stats.create_descriptive_stats_component"]], "extract_df() (in module textdescriptives.extractors)": [[3, "textdescriptives.extractors.extract_df"]], "extract_dict() (in module textdescriptives.extractors)": [[3, "textdescriptives.extractors.extract_dict"]], "extract_metrics() (in module textdescriptives.extractors)": [[3, "textdescriptives.extractors.extract_metrics"]], "create_information_theory_component() (in module textdescriptives.components.information_theory)": [[6, "textdescriptives.components.information_theory.create_information_theory_component"]], "create_pos_proportions_component() (in module textdescriptives.components.pos_proportions)": [[9, "textdescriptives.components.pos_proportions.create_pos_proportions_component"]], "alpha_ratio (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.alpha_ratio"]], "alpha_ratio (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.alpha_ratio"]], "contains (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.contains"]], "contains (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.contains"]], "create_quality_component() (in module textdescriptives.components.quality)": [[10, "textdescriptives.components.quality.create_quality_component"]], "doc_length (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.doc_length"]], "doc_length (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.doc_length"]], "duplicate_line_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.duplicate_line_chr_fraction"]], "duplicate_line_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.duplicate_line_chr_fraction"]], "duplicate_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.duplicate_ngram_chr_fraction"]], "duplicate_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.duplicate_ngram_chr_fraction"]], "duplicate_paragraph_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.duplicate_paragraph_chr_fraction"]], "duplicate_paragraph_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.duplicate_paragraph_chr_fraction"]], "mean_word_length (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.mean_word_length"]], "mean_word_length (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.mean_word_length"]], "n_stop_words (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.n_stop_words"]], "n_stop_words (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.n_stop_words"]], "oov_ratio (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.oov_ratio"]], "oov_ratio (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.oov_ratio"]], "passed (textdescriptives.components.quality_data_classes.qualityoutput property)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.passed"]], "passed (textdescriptives.components.quality_data_classes.thresholdsoutput property)": [[10, "textdescriptives.components.quality_data_classes.ThresholdsOutput.passed"]], "proportion_bullet_points (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.proportion_bullet_points"]], "proportion_bullet_points (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.proportion_bullet_points"]], "proportion_ellipsis (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.proportion_ellipsis"]], "proportion_ellipsis (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.proportion_ellipsis"]], "symbol_to_word_ratio (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.symbol_to_word_ratio"]], "symbol_to_word_ratio (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.symbol_to_word_ratio"]], "threshold (textdescriptives.components.quality_data_classes.thresholdsoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.ThresholdsOutput.threshold"]], "to_flat_value_dict() (textdescriptives.components.quality_data_classes.qualityoutput method)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.to_flat_value_dict"]], "top_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.top_ngram_chr_fraction"]], "top_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.top_ngram_chr_fraction"]], "value (textdescriptives.components.quality_data_classes.thresholdsoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.ThresholdsOutput.value"]], "create_readability_component() (in module textdescriptives.components.readability)": [[11, "textdescriptives.components.readability.create_readability_component"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["coherence", "dependencydistance", "descriptivestats", "extractors", "faq", "index", "information_theory", "installation", "news", "posstats", "quality", "readability", "tutorial", "tutorials/filter_corpus_using_quality", "tutorials/introductory_tutorial", "tutorials/sklearn_integration", "usingthepackage"], "filenames": ["coherence.rst", "dependencydistance.rst", "descriptivestats.rst", "extractors.rst", "faq.rst", "index.rst", "information_theory.rst", "installation.rst", "news.rst", "posstats.rst", "quality.rst", "readability.rst", "tutorial.rst", "tutorials/filter_corpus_using_quality.ipynb", "tutorials/introductory_tutorial.ipynb", "tutorials/sklearn_integration.ipynb", "usingthepackage.rst"], "titles": ["Coherence", "Dependency Distance", "Descriptive Statistics", "Extractor", "Frequently Asked Questions", "TextDescriptives", "Information Theory", "Installation", "News and Changelog", "Part-of-Speech Proportions", "Quality", "Readability", "Tutorials", "Filtering corpora using Quality", "Introductory Tutorial", "Scikit-learn Integration", "Quick Start"], "terms": {"The": [0, 1, 2, 3, 5, 6, 9, 10, 11, 15, 16], "calcul": [0, 1, 3, 5, 6, 10, 11, 14, 16], "document": [0, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, 16], "base": [0, 3, 6, 11, 13, 16], "word": [0, 6, 10, 11, 13, 16], "embed": 0, "cosin": 0, "similar": [0, 4, 13], "between": [0, 10, 13, 14], "sentenc": [0, 1, 2, 6, 9, 11, 13, 16], "textdescript": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13, 15, 16], "current": [0, 7], "implement": [0, 1, 8, 10, 11, 13, 15], "first": [0, 11, 13, 16], "order": [0, 12, 16], "second": [0, 16], "follow": [0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 13, 16], "e": [0, 4, 6, 7, 8, 10, 13, 14, 15, 16], "g": [0, 4, 7, 8, 10, 13, 14, 15, 16], "1": [0, 1, 2, 6, 8, 10, 11, 13, 14, 15, 16], "2": [0, 10, 11, 13, 14, 15], "consecut": 0, "ar": [0, 1, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 16], "two": [0, 5], "apart": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15], "spaci": [0, 1, 2, 3, 5, 6, 9, 10, 11, 13], "": [0, 5, 9, 10, 11, 13, 14, 15, 16], "span": [0, 1, 2, 6, 9, 10, 16], "method": [0, 10, 15], "i": [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16], "pipelin": [0, 1, 3, 5, 6, 9, 10, 11, 13, 14, 15, 16], "small": [0, 10, 13, 14], "medium": [0, 10], "larg": [0, 5, 8, 10, 13, 14], "transform": [0, 10], "model": [0, 3, 6, 10, 13, 14, 15, 16], "have": [0, 8, 10, 13, 14, 16], "differ": [0, 13, 14], "result": 0, "If": [0, 3, 4, 5, 9, 10, 11, 13, 14, 16], "you": [0, 4, 5, 7, 10, 12, 13, 14, 15, 16], "want": [0, 4, 10, 13, 14, 15, 16], "specif": [0, 4, 7, 13], "fasttext": 0, "should": [0, 4, 8, 10, 13], "overwrit": [0, 10, 13], "doc": [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16], "vector": 0, "attribut": [0, 1, 2, 6, 9, 10, 11, 13], "read": [0, 11, 13, 16], "more": [0, 1, 3, 6, 10, 11, 13, 14, 16], "ad": [0, 1, 2, 6, 8, 9, 10, 11], "object": [0, 1, 2, 3, 6, 9, 10, 11, 14, 16], "_": [0, 1, 2, 6, 9, 10, 11, 13, 16], "first_order_coherence_valu": 0, "A": [0, 3, 5, 6, 10, 11, 13], "list": [0, 3, 10, 12, 13, 15], "float": [0, 10, 16], "where": [0, 6, 11], "each": [0, 1, 3, 5, 6, 7, 9, 12, 13, 16], "second_order_coherence_valu": 0, "coherer": 0, "dict": [0, 1, 2, 3, 9, 10, 11, 16], "contain": [0, 3, 5, 10, 13, 14, 15, 16], "mean": [0, 1, 2, 10, 13, 15, 16], "valu": [0, 6, 10, 11, 13, 15, 16], "kei": [0, 6, 9, 14], "first_order_coher": 0, "second_order_coher": 0, "bedi": 0, "carrillo": 0, "f": [0, 10, 13], "cecchi": 0, "slezak": 0, "d": 0, "sigman": 0, "m": [0, 4, 10, 14], "mota": 0, "n": [0, 6, 10, 13], "b": 0, "ribeiro": 0, "javitt": 0, "c": [0, 4, 10, 14, 15], "copelli": 0, "corcoran": 0, "2015": 0, "autom": [0, 11, 16], "analysi": [0, 10, 15], "free": [0, 14, 15], "speech": [0, 13, 16], "predict": [0, 6], "psychosi": 0, "onset": 0, "high": [0, 11, 14], "risk": 0, "youth": 0, "npj": 0, "schizophrenia": 0, "articl": 0, "http": [0, 7, 11, 13], "doi": 0, "org": [0, 11], "10": [0, 10, 13, 14, 15], "1038": 0, "npjschz": 0, "30": [0, 10, 11], "parola": 0, "lin": 0, "j": [0, 10], "simonsen": 0, "blikst": 0, "v": [0, 5], "zhou": 0, "y": [0, 14], "wang": 0, "h": [0, 6], "inou": 0, "l": [0, 11], "koelkebeck": 0, "k": [0, 10], "fusaroli": 0, "r": [0, 4, 7], "2022": [0, 8], "disturb": 0, "assess": 0, "cross": 0, "linguist": 0, "generaliz": 0, "nlp": [0, 1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 16], "measur": [0, 1, 6, 11, 16], "research": [0, 5], "1016": 0, "schre": 0, "07": 0, "002": 0, "import": [0, 1, 2, 6, 9, 10, 11, 13, 14, 15, 16], "td": [0, 1, 2, 6, 9, 10, 11, 13, 14, 16], "load": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 15, 16], "en_core_web_lg": [0, 6, 16], "add_pip": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 16], "world": [0, 1, 2, 9, 10, 11, 16], "chang": [0, 1, 2, 8, 9, 10, 11, 16], "feel": [0, 1, 2, 9, 10, 11, 16], "water": [0, 1, 2, 9, 10, 11, 16], "earth": [0, 1, 2, 9, 10, 11, 16], "smell": [0, 1, 2, 9, 10, 11, 16], "air": [0, 1, 2, 9, 10, 11, 16], "much": [0, 1, 2, 9, 10, 11, 16], "onc": [0, 1, 2, 8, 9, 10, 11, 16], "wa": [0, 1, 2, 9, 10, 11, 13, 16], "lost": [0, 1, 2, 9, 10, 11, 16], "none": [0, 1, 2, 3, 9, 10, 11, 13, 16], "now": [0, 1, 2, 8, 9, 10, 11, 14, 15, 16], "live": [0, 1, 2, 9, 10, 11, 14, 15, 16], "who": [0, 1, 2, 9, 10, 11, 16], "rememb": [0, 1, 2, 9, 10, 11, 16], "all": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13, 14, 16], "store": [0, 1, 2, 9, 10, 11], "also": [0, 4, 6, 10, 12, 13, 16], "separ": 0, "note": [0, 4, 10, 11], "do": [0, 10, 13, 14], "ani": [0, 3, 10, 11, 14], "respect": 0, "thei": [0, 10, 13, 16], "requir": [0, 4, 6, 7, 10, 11, 13, 15], "averag": [0, 1, 10, 11], "over": [0, 1, 11], "extract": [0, 1, 2, 3, 6, 9, 10, 11, 15, 16], "datafram": [0, 1, 2, 3, 6, 9, 10, 11, 14, 15, 16], "extract_df": [0, 1, 2, 3, 6, 9, 10, 11, 14, 16], "text": [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 14, 15, 16], "0": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 15], "780735": 0, "749475": 0, "create_coherence_compon": 0, "languag": [0, 1, 2, 3, 6, 9, 10, 11, 14], "name": [0, 1, 2, 6, 9, 10, 11, 14, 15], "str": [0, 1, 2, 3, 6, 9, 10, 11], "callabl": [0, 1, 2, 9, 10, 11], "sourc": [0, 1, 2, 3, 6, 9, 10, 11, 13], "allow": [0, 1, 2, 6, 9, 10, 11, 13, 15], "pipe": [0, 1, 2, 3, 8, 9, 10, 11, 13, 15, 16], "thi": [0, 1, 2, 6, 8, 9, 10, 11, 13, 14, 15, 16], "set": [0, 1, 3, 6, 8, 9, 10, 11, 13, 14, 15, 16], "paramet": [0, 1, 2, 3, 6, 9, 10, 11, 14, 16], "doe": [0, 1, 2, 9, 10, 11, 13], "need": [0, 1, 2, 9, 10, 11, 13, 15], "specifi": [0, 1, 2, 4, 9, 10, 11, 14, 16], "call": [0, 1, 2, 3, 9, 10, 11], "can": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], "option": [0, 1, 2, 3, 7, 9, 10, 11, 13, 14, 15], "argument": [0, 1, 2, 9, 10, 11, 13, 15, 16], "return": [0, 1, 2, 3, 9, 10, 11, 13], "type": [0, 1, 2, 3, 5, 9, 10, 11, 13], "exampl": [0, 1, 2, 6, 8, 9, 10, 11, 13, 15], "en_core_web_md": 0, "appli": [0, 1, 2, 9, 11, 13], "anoth": [0, 2], "get": [0, 5, 7, 12, 13, 14, 15], "dependency_dist": [1, 3, 14, 16], "add": [1, 2, 3, 9, 10, 11, 13, 16], "both": 1, "token": [1, 2, 6, 9, 10, 13, 14, 16], "under": [1, 10, 11], "syntact": 1, "complex": [1, 6, 11], "greater": 1, "liu": [1, 10], "2008": 1, "oya": 1, "2011": 1, "we": [1, 10, 11, 12, 13, 14, 15], "from": [1, 3, 4, 5, 6, 10, 11, 13, 14, 15], "take": [1, 13], "level": [1, 11], "pleas": [1, 4, 5, 13], "see": [1, 8, 13, 14, 16], "issu": [1, 4, 5], "how": [1, 5, 6, 11, 13, 14, 15], "metric": [1, 3, 5, 6, 8, 10, 11, 13, 15, 16], "propos": [1, 10], "For": [1, 10, 13, 14, 16], "standard": [1, 13, 14, 16], "deviat": [1, 14, 16], "along": 1, "proport": [1, 2, 8, 10, 11, 13, 16], "adjac": [1, 16], "relat": [1, 5, 8, 16], "whether": [1, 3, 10, 16], "an": [1, 3, 4, 6, 10, 11, 13, 14, 15, 16], "en_core_web_sm": [1, 2, 9, 10, 11, 14, 16], "access": [1, 15, 16], "same": [1, 15], "wai": 1, "3": [1, 2, 4, 5, 10, 11, 13, 14, 15], "dependency_distance_mean": [1, 14], "dependency_distance_std": [1, 14], "prop_adjacent_dependency_relation_mean": [1, 14], "prop_adjacent_dependency_relation_std": [1, 14], "77524": 1, "553188": 1, "457143": 1, "0722806": 1, "create_dependency_distance_compon": 1, "creat": [1, 4, 9, 10, 13, 16], "factori": 1, "dependencydist": [1, 14], "descriptive_stat": [2, 3, 8, 11, 15, 16], "number": [2, 6, 10, 11, 14, 16], "count": [2, 13, 14, 16], "uniqu": [2, 14, 16], "charact": [2, 10, 11, 13, 14, 16], "sentence_length": [2, 16], "length": [2, 6, 10, 11, 13, 14, 16], "median": [2, 15, 16], "std": [2, 16], "syllabl": [2, 11, 16], "per": [2, 6, 11, 16], "token_length": [2, 16], "singl": [2, 3], "individu": [2, 16], "token_length_mean": [2, 11, 14, 15], "token_length_median": [2, 11, 15], "token_length_std": [2, 11, 14, 15], "sentence_length_mean": [2, 11, 15], "sentence_length_median": [2, 11, 14, 15], "sentence_length_std": [2, 11, 14, 15], "syllables_per_token_mean": [2, 11, 14, 15], "syllables_per_token_median": [2, 11, 14, 15], "syllables_per_token_std": [2, 11, 14, 15], "n_token": [2, 11, 14, 15], "n_unique_token": [2, 11, 14, 15], "proportion_unique_token": [2, 11, 14, 15], "n_charact": [2, 11, 14, 15], "n_sentenc": [2, 11, 14, 15], "28571": [2, 11], "54127": [2, 11], "7": [2, 4, 8, 10, 11, 13, 15], "6": [2, 10, 11, 13, 14, 15, 16], "09839": [2, 11], "08571": [2, 11], "368117": [2, 11], "35": [2, 11], "23": [2, 11], "657143": [2, 11], "121": [2, 11], "5": [2, 8, 10, 11, 13, 14, 15], "create_descriptive_stats_compon": 2, "verbos": [2, 11], "bool": [2, 3, 9, 10, 11], "descriptivestatist": [2, 11], "blank": [2, 6, 10, 11, 13, 16], "en": [2, 11, 13, 14, 15, 16], "stat": 2, "featur": [3, 5, 15], "extract_metr": [3, 15, 16], "meant": 3, "raw": 3, "wherea": 3, "extract_dict": [3, 16], "work": [3, 11, 13, 16], "union": [3, 10], "lang": [3, 14, 15, 16], "iter": 3, "spacy_model": [3, 14, 16], "spacy_model_s": [3, 14], "lg": 3, "panda": [3, 10, 15, 16], "provid": [3, 15], "automat": [3, 14, 16], "download": [3, 12, 13, 14, 16], "default": [3, 8, 9, 10, 11, 13, 16], "which": [3, 4, 6, 8, 10, 12, 13, 14, 15, 16], "One": 3, "readabl": [3, 5, 14, 16], "pos_proport": [3, 8, 9, 16], "coher": [3, 6, 16], "qualiti": [3, 8, 12, 16], "information_theori": [3, 6, 16], "one": [3, 6, 9, 10, 13, 16], "size": [3, 14], "row": [3, 14, 16], "column": [3, 14, 15, 16], "pd": [3, 15], "include_text": [3, 14, 16], "true": [3, 9, 10, 11, 13, 14], "gener": [3, 5, 13], "case": [3, 13], "compoen": 3, "ha": [3, 8], "been": [3, 8, 10], "dictionari": [3, 6, 10, 16], "entri": [3, 14, 15], "come": 4, "ensur": 4, "function": [4, 5, 10, 14, 15, 16], "To": [4, 5, 7, 12, 13, 14, 15, 16], "clone": [4, 7], "repositori": [4, 7, 13], "instal": [4, 5, 13, 14, 15], "pip": [4, 7, 13, 14, 15], "style": [4, 7], "txt": [4, 7], "depend": [4, 5, 7, 14, 16], "well": [4, 6, 13], "lint": [4, 7], "These": [4, 6, 10, 13], "pyproject": 4, "toml": 4, "file": 4, "pytest": 4, "folder": 4, "python": [4, 5, 14], "desired_test": 4, "py": [4, 13], "check": [4, 10, 13, 16], "coverag": 4, "cov": 4, "intend": 4, "major": [4, 13], "o": 4, "includ": [4, 6, 7, 10, 14, 16], "window": 4, "latest": 4, "version": [4, 13], "maco": 4, "linux": 4, "ubuntu": 4, "similarli": [4, 6], "8": [4, 10, 11, 13, 15], "9": [4, 10, 13, 14, 15], "onli": [4, 10, 13, 14, 15, 16], "system": 4, "being": 4, "activ": 4, "earlier": 4, "like": [4, 6, 13], "sphinx": 4, "It": [4, 6, 10, 11, 13], "furo": 4, "theme": 4, "custom": [4, 13], "make": [4, 14, 15], "extens": [4, 5, 10, 13], "html": 4, "librari": [5, 16], "varieti": [5, 13, 14], "statist": [5, 14, 16], "compon": [5, 8, 13, 14, 15], "sever": 5, "descript": [5, 16], "distanc": [5, 14, 16], "wish": [5, 10, 13], "try": [5, 13, 14, 15], "out": [5, 10], "packag": [5, 7, 12, 13, 14, 15, 16], "onlin": 5, "demo": 5, "organ": 5, "part": [5, 10, 13, 14, 16], "start": [5, 7, 10, 12, 13, 14], "instruct": 5, "guid": 5, "tutori": [5, 7, 15], "refer": [5, 10], "public": 5, "class": [5, 13], "report": 5, "request": 5, "github": [5, 7], "tracker": 5, "kindli": 5, "forum": [5, 13], "otherwis": [5, 16], "discuss": 5, "bug": 5, "idea": 5, "usag": 5, "your": [5, 7, 13, 14, 16], "cite": 5, "preprint": [5, 10], "index": [5, 11, 14, 16], "page": 5, "theoret": 6, "deriv": 6, "entropi": [6, 16], "shannon": 6, "prob": 6, "probabl": [6, 13], "defin": 6, "x": [6, 13, 14], "sum_": 6, "p": [6, 10], "x_i": 6, "log_": 6, "perplex": [6, 16], "distribut": [6, 13, 14], "sampl": [6, 13], "ppl": 6, "per_word_perplex": [6, 16], "divid": [6, 15], "se": 6, "consid": [6, 10], "normal": 6, "often": [6, 13], "describ": 6, "higher": [6, 11, 13], "could": [6, 10, 13, 14], "imagin": 6, "filter": [6, 8, 10, 12], "given": 6, "assumpt": 6, "highli": 6, "surpris": 6, "fact": 6, "non": 6, "piec": [6, 11], "avail": [6, 13, 14, 15], "lexem": 6, "prop": 6, "tabl": [6, 16], "warn": [6, 11, 13], "rais": [6, 11, 13], "np": [6, 11], "nan": [6, 11, 14], "cannot": 6, "found": [6, 14], "simpl": [6, 9, 11, 13, 14], "veri": [6, 13], "288195": 6, "334017": 6, "190574": 6, "create_information_theory_compon": 6, "informationtheori": 6, "command": [6, 7, 13], "run": [7, 10, 12, 13, 14, 16], "line": [7, 10], "termin": 7, "dependendic": 7, "build": [7, 14], "test": [7, 9, 10, 11, 13, 14, 15], "git": 7, "com": 7, "hlass": 7, "link": 7, "t": [7, 10, 13, 14, 15, 16], "extra": [7, 10], "v2": 8, "1st": 8, "januari": 8, "2023": 8, "renam": 8, "prefix": [8, 15, 16], "pos_stat": 8, "consist": [8, 13], "v1": 8, "21st": 8, "septemb": 8, "seri": [8, 13], "some": [8, 10, 13, 14, 16], "were": [8, 10, 13, 16], "rae": [8, 10], "et": [8, 10], "al": [8, 10], "2021": [8, 10], "raffel": [8, 10], "2020": [8, 10], "corpora": [8, 10, 12], "4th": 8, "mai": [8, 13], "minor": 8, "fix": 8, "bell": 8, "whistl": 8, "octob": 8, "po": [8, 9], "pos_": 8, "instead": [8, 10, 11, 13, 14], "tag_": 8, "behavior": 8, "use_tag": 8, "fals": [8, 9, 10, 13, 14, 15, 16], "when": [8, 13], "initialis": 8, "modul": [8, 11], "pos_prop_postag": 9, "tag": [9, 16], "postag": 9, "By": [9, 16], "possibl": 9, "behaviour": [9, 16], "turn": 9, "off": 9, "add_all_tag": 9, "initi": [9, 14], "pos_prop_adj": 9, "pos_prop_adp": 9, "pos_prop_adv": 9, "pos_prop_aux": 9, "pos_prop_cconj": 9, "pos_prop_det": 9, "pos_prop_intj": 9, "pos_prop_noun": 9, "pos_prop_num": 9, "pos_prop_part": 9, "pos_prop_pron": 9, "pos_prop_propn": 9, "pos_prop_punct": 9, "pos_prop_sconj": 9, "pos_prop_sym": 9, "pos_prop_verb": 9, "pos_prop_x": 9, "0243902": 9, "097561": 9, "0487805": 9, "0731707": 9, "121951": 9, "195122": 9, "146341": 9, "170732": 9, "create_pos_proportions_compon": 9, "use_po": 9, "pospropot": 9, "detail": [9, 16], "posproport": 9, "heurist": [10, 13], "stop": [10, 13], "n_stop_word": [10, 13], "alpha": [10, 13], "ratio": [10, 13], "alpha_ratio": [10, 13], "least": [10, 13], "alphabet": [10, 13], "mean_word_length": [10, 13], "ellipsi": 10, "proportion_ellipsi": [10, 13], "end": [10, 15], "bullet": 10, "point": [10, 14, 15], "proportion_bullet_point": [10, 13], "symbol": [10, 13], "symbol_": 10, "_2_word_ratio": 10, "hashtag": [10, 13], "curli": 10, "bracket": 10, "string": [10, 13], "contains_": 10, "instanc": [10, 13], "lorem": [10, 13], "ipsum": [10, 13], "vocabulari": 10, "oov_ratio": [10, 13], "total": [10, 11, 13], "repetiti": 10, "duplic": [10, 13], "fraction": [10, 13], "duplicate_lines_chr_fract": 10, "within": 10, "paragraph": [10, 13], "duplicate_paragraphs_chr_fract": 10, "gram": [10, 13], "duplicate_": 10, "_gram_chr_fract": 10, "rang": [10, 13], "top": [10, 14], "top_": 10, "pre": 10, "train": [10, 13, 15], "optim": 10, "speed": 10, "rather": 10, "usabl": 10, "simplic": 10, "integr": [10, 12, 16], "corpu": [10, 13, 14], "danish": [10, 13], "foundat": 10, "other": [10, 13, 16], "dedupl": 10, "strategi": [10, 15], "pass": [10, 13, 14, 15, 16], "passed_quality_check": [10, 13, 16], "doc_length": [10, 13], "duplicate_line_chr_fract": [10, 13], "duplicate_paragraph_chr_fract": [10, 13], "duplicate_5": 10, "gram_chr_fract": [10, 13], "duplicate_6": 10, "duplicate_7": 10, "duplicate_8": 10, "duplicate_9": 10, "duplicate_10": [10, 13], "top_2": 10, "top_3": 10, "top_4": 10, "contains_lorem": 10, "24": [10, 14], "853659": 10, "95122": 10, "41": 10, "232258": 10, "0580645": 10, "174194": 10, "threshold": [10, 13], "so": [10, 13, 14, 15], "qualitythreshold": [10, 13], "just": [10, 11, 13], "upper": [10, 13], "bound": [10, 13], "100000": [10, 13], "symbol_to_word_ratio": [10, 13], "duplicate_ngram_chr_fract": [10, 13], "15": [10, 11, 13], "14": [10, 13, 14], "13": [10, 13, 14, 15], "12": [10, 11, 13, 14, 15], "11": [10, 11, 13, 14, 15], "top_ngram_chr_fract": [10, 13], "18": [10, 11, 13], "4": [10, 11, 13, 14, 15], "16": [10, 13], "quality_pip": [10, 13], "set_quality_threshold": [10, 13], "updat": [10, 13], "create_quality_compon": 10, "top_ngram_rang": 10, "tupl": 10, "int": 10, "top_ngram_min_count": 10, "duplicate_n_gram_fraction_rang": 10, "vocab": 10, "map": 10, "forc": 10, "overwritten": 10, "split": [10, 13, 15], "rn": 10, "low": [10, 13], "w": 10, "borgeaud": 10, "cai": 10, "millican": 10, "hoffmann": 10, "song": 10, "irv": 10, "scale": 10, "insight": 10, "gopher": 10, "arxiv": 10, "2112": 10, "11446": 10, "shazeer": 10, "robert": 10, "lee": 10, "narang": 10, "matena": 10, "explor": [10, 13], "limit": 10, "transfer": 10, "learn": [10, 12], "unifi": 10, "mach": 10, "re": [10, 16], "21": [10, 11], "140": 10, "67": 10, "them": [10, 14, 15, 16], "minimum": 10, "time": [10, 13, 14, 15], "must": [10, 13], "occur": 10, "exist": [10, 13], "spacy_qu": 10, "pydant": 10, "quality_data_class": 10, "config": 10, "forbid": 10, "field": 10, "70": 10, "lower": [10, 11, 13], "origin": [10, 11, 14], "account": 10, "adiffer": 10, "definit": 10, "boundari": 10, "punctuat": 10, "isnot": 10, "100_000": [10, 13], "most": [10, 13], "20": [10, 14], "80": [10, 11], "symboloccurr": 10, "occurr": 10, "100": [10, 11, 13], "appear": [10, 13], "than": [10, 11, 13], "qualityoutput": [10, 13], "output": [10, 13, 15, 16], "thresholdsoutput": [10, 13], "presenc": 10, "to_flat_value_dict": 10, "flat": 10, "represent": 10, "easi": [10, 11, 13], "convers": 10, "properti": 10, "three": [10, 11, 13], "item": 10, "either": 10, "interv": 10, "accept": 10, "boolean": [10, 14, 16], "t_out": 10, "readabiltii": [11, 16], "hyphen": 11, "pyphen": 11, "support": 11, "gun": [11, 16], "fog": [11, 16], "wikipedia": 11, "wiki": 11, "gunning_fog_index": 11, "__": 11, "develop": 11, "english": [11, 14], "write": 11, "estim": 11, "year": 11, "formal": 11, "educ": 11, "understand": 11, "u": [11, 13, 14, 15], "school": 11, "senior": 11, "around": [11, 13], "old": 11, "formula": 11, "grade": [11, 16], "asl": 11, "phw": 11, "percentag": 11, "hard": 11, "smog": [11, 14, 16], "gobbledygook": 11, "primarili": 11, "focus": 11, "polysyllab": 11, "043": 11, "1291": 11, "flesch": [11, 16], "eas": [11, 16], "e2": 11, "93kincaid_readability_test": 11, "flesch_reading_eas": [11, 14], "score": [11, 13, 15, 16], "indic": [11, 16], "easier": [11, 15], "while": [11, 13], "difficult": 11, "206": 11, "835": 11, "015": 11, "84": 11, "asw": 11, "kincaid": [11, 16], "93kincaid_grade_level": 11, "comprehend": 11, "39": [11, 14], "59": 11, "automated_readability_index": [11, 14], "approxim": 11, "ari": 11, "71": 11, "n_char": 11, "n_word": 11, "43": 11, "coleman": [11, 16], "liau": [11, 16], "93liau_index": 11, "___": 11, "letter": 11, "cli": 11, "0588": 11, "296": 11, "In": [11, 13, 14, 15], "our": [11, 13, 14], "entir": 11, "lix": [11, 14, 16], "lix_": 11, "readability_test": 11, "lesbarhetsindex": 11, "long": [11, 13], "six": 11, "n_long_word": 11, "rix": [11, 14, 16], "www": 11, "jstor": 11, "stabl": 11, "40031755": 11, "difficulti": 11, "flesch_kincaid_grad": [11, 14], "gunning_fog": [11, 14], "coleman_liau_index": [11, 14], "107": 11, "879": 11, "0485714": 11, "68392": 11, "94286": 11, "45429": 11, "708571": 11, "7143": 11, "create_readability_compon": 11, "alreadi": [11, 14, 15], "toggl": 11, "show": [11, 13, 16], "messag": [11, 13, 14, 15], "recommend": 12, "go": [12, 14, 15], "through": [12, 13, 14, 15], "below": [12, 13, 16], "jupyt": 12, "notebook": 12, "local": [12, 13], "introductori": [12, 15], "scikit": 12, "mani": 13, "analys": [13, 14], "tweet": 13, "scrape": 13, "remov": [13, 15], "huggingfac": 13, "thu": 13, "Or": 13, "except": [13, 14, 15], "mc4": 13, "would": [13, 15], "ag": 13, "whole": [13, 16], "stream": 13, "down": 13, "1000": 13, "load_dataset": 13, "000": 13, "home": 13, "runner": 13, "lib": 13, "python3": 13, "site": 13, "1429": 13, "futurewarn": 13, "code": [13, 14], "execut": 13, "correctli": 13, "inspect": 13, "hf": 13, "co": 13, "avoid": 13, "futur": 13, "trust_remote_cod": 13, "mandatori": 13, "next": [13, 14], "releas": 13, "let": [13, 14, 15], "look": 13, "400": 13, "print": [13, 15], "post": 13, "362": 13, "info": 13, "okai": 13, "those": 13, "help": [13, 16], "laugh": 13, "ask": 13, "about": [13, 14], "ohm": 13, "power": 13, "lsi15": 13, "know": 13, "book": 13, "websit": 13, "someon": 13, "talk": 13, "seek": 13, "what": 13, "share": 13, "me": 13, "question": 13, "gain": 13, "audio": 13, "thats": 13, "Not": 13, "up": 13, "segment": 13, "here": 13, "quit": 13, "especi": 13, "simpli": [13, 14, 15], "examin": 13, "seem": 13, "did": 13, "why": 13, "435": 13, "79": 13, "52": 13, "894": 13, "42": [13, 15], "38": 13, "36": 13, "01": 13, "natur": 13, "might": 13, "easili": [13, 14], "realli": 13, "repitit": 13, "sign": 13, "max": 13, "quality_threshold": 13, "typic": 13, "interest": [13, 16], "filtered_text": 13, "len": 13, "process": 13, "572": 13, "lot": [13, 14], "howev": 13, "unreason": 13, "adjust": 13, "new_threshold": 13, "don": [13, 14, 15, 16], "dynam": 13, "new": 13, "tune": 13, "reason": 13, "bit": [13, 14], "further": 13, "section": [13, 16], "lead": 13, "problem": 13, "directli": 13, "gigaword": 13, "purpos": 13, "2500": 13, "collect": [13, 14, 15], "script": 13, "error": 13, "littl": 13, "ddsc": 13, "partial": 13, "ten_sampl": 13, "select": 13, "to_panda": 13, "As": [13, 15], "previous": 13, "mention": 13, "multipl": 13, "retsinformationdk": 13, "legal": 13, "hest": 13, "debat": 13, "nettet": 13, "dk": 13, "spont": 13, "transcrib": 13, "spontan": 13, "lambda": 13, "num_proc": 13, "tv2r": 13, "notabl": 13, "fewer": 13, "after": [13, 15], "prepar": 13, "extrem": 13, "thing": 13, "creas": 13, "da": [13, 16], "max_length": 13, "2000000": 13, "increas": 13, "worth": 13, "beforehand": 13, "legal_doc": 13, "effici": 13, "format": [13, 16], "50": 13, "uncommon": 13, "phrase": 13, "previou": 13, "close": 13, "mostli": 13, "made": 13, "good": 13, "later": 13, "common": 13, "assum": [13, 14], "keep": 13, "legal_docs_filt": 13, "had": 13, "That": [13, 14], "seaborn": [13, 14], "sn": [13, 14], "def": 13, "get_duplicate_10_gram_fract": 13, "duplicate_10_gram_fract": 13, "histplot": 13, "explain": 13, "everyth": 13, "relev": [13, 16], "few": [13, 14], "perculiar": 13, "behav": 13, "news_doc": 13, "speech_doc": 13, "news_alpha_ratio": 13, "speech_alpha_ratio": 13, "plot": [13, 14], "histogram": 13, "matplotlib": [13, 14], "pyplot": [13, 14], "plt": [13, 14], "label": [13, 14, 15], "binwidth": 13, "05": 13, "xlabel": [13, 14], "ylabel": [13, 14], "legend": 13, "coupl": 13, "fair": 13, "amount": 13, "abov": 13, "almost": 13, "suspic": 13, "depth": 13, "dento": 13, "speaker": 13, "alpa": 13, "problemat": 13, "task": 13, "hand": 13, "therefor": 13, "target": 13, "quickli": [14, 16], "spend": 14, "exactli": 14, "care": 14, "introduc": 14, "inform": [14, 15, 16], "ll": [14, 15], "quick": 14, "overview": 14, "sm": [14, 15], "spam": [14, 15], "dataset": [14, 15], "5572": [14, 15], "categor": [14, 15], "ham": [14, 15], "familiar": 14, "util": [14, 15], "load_sms_data": [14, 15], "df": [14, 15, 16], "head": [14, 15, 16], "until": [14, 15], "jurong": [14, 15], "crazi": [14, 15], "ok": [14, 15], "lar": [14, 15], "joke": [14, 15], "wif": [14, 15], "oni": [14, 15], "wkly": [14, 15], "comp": [14, 15], "win": [14, 15], "fa": [14, 15], "cup": [14, 15], "fina": [14, 15], "dun": [14, 15], "sai": [14, 15], "earli": [14, 15], "hor": [14, 15], "nah": [14, 15], "think": [14, 15], "he": [14, 15, 16], "goe": [14, 15], "usf": [14, 15], "aro": [14, 15], "value_count": 14, "4825": 14, "747": 14, "dtype": 14, "int64": 14, "handl": 14, "boilerpl": 14, "haven": 14, "befor": 14, "appropri": [14, 16], "en_core_web_model": 14, "altern": 14, "join": 14, "metrics_df": 14, "drop": [14, 15], "333333": 14, "833333": 14, "388889": 14, "055556": 14, "90": 14, "935000": 14, "060000": 14, "250000": 14, "433013": 14, "000000": 14, "92": 14, "666667": 14, "433333": 14, "233333": 14, "105": 14, "090000": 14, "653333": 14, "166667": 14, "372678": 14, "516807": 14, "659664": 14, "521008": 14, "050420": 14, "917857": 14, "777143": 14, "178571": 14, "382993": 14, "28": 14, "857143": 14, "128": 14, "452381": 14, "119048": 14, "535714": 14, "035714": 14, "116": 14, "652500": 14, "645000": 14, "818182": 14, "066667": 14, "400000": 14, "109": 14, "040000": 14, "280000": 14, "923077": 14, "49": 14, "larger": 14, "yet": 14, "0x7f801fe352a0": 14, "whenev": 14, "spacy_pipe_metrics_df": 14, "left": 14, "assur": 14, "ourselv": 14, "ident": 14, "equal": 14, "With": 14, "sens": 14, "boxplot": 14, "ax": 14, "correl": 14, "strongli": 14, "encod": 14, "is_ham": 14, "comput": 14, "metrics_correl": 14, "corrwith": 14, "sort_valu": [14, 15], "ab": 14, "ascend": [14, 15], "409000": 14, "408325": 14, "363069": 14, "322233": 14, "322176": 14, "303665": 14, "298789": 14, "296832": 14, "294734": 14, "294251": 14, "float64": 14, "pretti": 14, "kde": 14, "fig": 14, "subplot": 14, "figsiz": 14, "sharei": 14, "enumer": 14, "kdeplot": 14, "hue": 14, "cool": 14, "ve": 14, "done": 14, "actual": 14, "step": 14, "continu": 14, "classifi": [14, 15], "extractor": 15, "saw": 15, "exploratori": 15, "data": 15, "walk": 15, "sklearn": 15, "classif": 15, "instanti": 15, "textdecriptivesfeatur": 15, "wrap": 15, "alright": 15, "sklearn_featur": 15, "textdescriptivesfeatur": 15, "descriptive_stats_extractor": 15, "sure": 15, "columntransform": 15, "necessari": [15, 16], "oper": 15, "miss": 15, "simpleimput": 15, "imput": 15, "randomforestclassifi": 15, "evalu": 15, "ensembl": 15, "compos": 15, "model_select": 15, "train_test_split": 15, "set_config": 15, "tell": 15, "transform_output": 15, "text_process": 15, "text_processing__": 15, "verbose_feature_names_out": 15, "x_train": 15, "x_test": 15, "y_train": 15, "y_test": 15, "axi": 15, "test_siz": 15, "random_st": 15, "fit": 15, "accuraci": 15, "9390134529147982": 15, "nice": 15, "get_features_out": 15, "feature_importance_map": 15, "zip": 15, "feature_names_in_": 15, "named_step": 15, "feature_importances_": 15, "sort": 15, "df_import": 15, "175819": 15, "139780": 15, "132489": 15, "110267": 15, "086906": 15, "059072": 15, "055452": 15, "051767": 15, "049400": 15, "038428": 15, "036969": 15, "035417": 15, "026346": 15, "001889": 15, "desir": 16, "auto": 16, "vice": 16, "versa": 16, "syntax": 16, "shorthand": 16, "conveni": 16, "control": 16, "extract_x": 16, "felt": 16, "hi": 16, "life": 16, "kind": 16, "dream": 16, "sometim": 16, "wonder": 16, "whose": 16, "enjoi": 16, "parser": 16, "textdesript": 16, "jeg": 16, "var": 16, "atten": 16, "tog": 16, "patent": 16, "p\u00e5": 16, "ild": 16, "det": 16, "skull": 16, "sener": 16, "vise": 16, "sig": 16, "blive": 16, "meget": 16, "indbringend": 16, "forretn": 16, "spi": 16, "skovsneglen": 16, "mull": 16, "du": 16, "vil": 16, "jo": 16, "gern": 16, "v\u00e6re": 16, "med": 16, "hulen": 16, "ikk": 16, "clever": 16, "enough": 16, "textdecript": 16, "theori": 16}, "objects": {"textdescriptives.components.coherence": [[0, 0, 1, "", "create_coherence_component"]], "textdescriptives.components.dependency_distance": [[1, 0, 1, "", "create_dependency_distance_component"]], "textdescriptives.components.descriptive_stats": [[2, 0, 1, "", "create_descriptive_stats_component"]], "textdescriptives.components.information_theory": [[6, 0, 1, "", "create_information_theory_component"]], "textdescriptives.components.pos_proportions": [[9, 0, 1, "", "create_pos_proportions_component"]], "textdescriptives.components.quality": [[10, 0, 1, "", "create_quality_component"]], "textdescriptives.components.quality_data_classes": [[10, 1, 1, "", "QualityOutput"], [10, 1, 1, "", "QualityThresholds"], [10, 1, 1, "", "ThresholdsOutput"]], "textdescriptives.components.quality_data_classes.QualityOutput": [[10, 2, 1, "", "alpha_ratio"], [10, 2, 1, "", "contains"], [10, 2, 1, "", "doc_length"], [10, 2, 1, "", "duplicate_line_chr_fraction"], [10, 2, 1, "", "duplicate_ngram_chr_fraction"], [10, 2, 1, "", "duplicate_paragraph_chr_fraction"], [10, 2, 1, "", "mean_word_length"], [10, 2, 1, "", "n_stop_words"], [10, 2, 1, "", "oov_ratio"], [10, 3, 1, "", "passed"], [10, 2, 1, "", "proportion_bullet_points"], [10, 2, 1, "", "proportion_ellipsis"], [10, 2, 1, "", "symbol_to_word_ratio"], [10, 4, 1, "", "to_flat_value_dict"], [10, 2, 1, "", "top_ngram_chr_fraction"]], "textdescriptives.components.quality_data_classes.QualityThresholds": [[10, 2, 1, "", "alpha_ratio"], [10, 2, 1, "", "contains"], [10, 2, 1, "", "doc_length"], [10, 2, 1, "", "duplicate_line_chr_fraction"], [10, 2, 1, "", "duplicate_ngram_chr_fraction"], [10, 2, 1, "", "duplicate_paragraph_chr_fraction"], [10, 2, 1, "", "mean_word_length"], [10, 2, 1, "", "n_stop_words"], [10, 2, 1, "", "oov_ratio"], [10, 2, 1, "", "proportion_bullet_points"], [10, 2, 1, "", "proportion_ellipsis"], [10, 2, 1, "", "symbol_to_word_ratio"], [10, 2, 1, "", "top_ngram_chr_fraction"]], "textdescriptives.components.quality_data_classes.ThresholdsOutput": [[10, 3, 1, "", "passed"], [10, 2, 1, "", "threshold"], [10, 2, 1, "", "value"]], "textdescriptives.components.readability": [[11, 0, 1, "", "create_readability_component"]], "textdescriptives.extractors": [[3, 0, 1, "", "extract_df"], [3, 0, 1, "", "extract_dict"], [3, 0, 1, "", "extract_metrics"]]}, "objtypes": {"0": "py:function", "1": "py:pydantic_model", "2": "py:pydantic_field", "3": "py:property", "4": "py:method"}, "objnames": {"0": ["py", "function", "Python function"], "1": ["py", "pydantic_model", "Python model"], "2": ["py", "pydantic_field", "Python field"], "3": ["py", "property", "Python property"], "4": ["py", "method", "Python method"]}, "titleterms": {"coher": 0, "usag": [0, 1, 2, 6, 9, 10, 11, 16], "compon": [0, 1, 2, 6, 9, 10, 11, 16], "depend": 1, "distanc": 1, "descript": 2, "statist": 2, "extractor": 3, "api": 3, "frequent": 4, "ask": [4, 5], "question": [4, 5], "how": 4, "do": 4, "i": [4, 13], "test": 4, "code": 4, "run": 4, "suit": 4, "doe": 4, "thi": 4, "packag": 4, "x": 4, "document": 4, "gener": 4, "textdescript": [5, 14], "where": 5, "citat": 5, "indic": 5, "search": 5, "inform": 6, "theori": 6, "instal": 7, "develop": 7, "new": 8, "changelog": 8, "part": 9, "speech": 9, "proport": 9, "qualiti": [10, 13], "data": [10, 13, 14], "class": 10, "readabl": 11, "tutori": [12, 13, 14], "filter": 13, "corpora": 13, "us": [13, 14, 16], "setup": [13, 15], "web": 13, "content": 13, "The": [13, 14], "extract": [13, 14], "high": 13, "text": 13, "chang": 13, "compar": 13, "domain": 13, "note": 13, "dagw": 13, "dataset": 13, "current": 13, "unavail": 13, "due": 13, "copyright": 13, "disput": 13, "remaind": 13, "ha": 13, "been": 13, "disabl": 13, "now": 13, "convert": 13, "markdown": 13, "re": 13, "enabl": 13, "onc": 13, "settl": 13, "out": 13, "across": 13, "introductori": 14, "easi": 14, "wai": 14, "metric": 14, "extract_metr": 14, "configur": 14, "add": 14, "pipe": 14, "spaci": [14, 16], "exploratori": 14, "analysi": 14, "scikit": 15, "learn": 15, "integr": 15, "quick": 16, "start": 16, "specif": 16, "avail": 16, "attribut": 16}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 57}, "alltitles": {"Coherence": [[0, "coherence"]], "Usage": [[0, "usage"], [1, "usage"], [2, "usage"], [6, "usage"], [9, "usage"], [10, "usage"], [11, "usage"]], "Component": [[0, "component"], [1, "component"], [2, "component"], [6, "component"], [9, "component"], [10, "component"], [11, "component"]], "Dependency Distance": [[1, "dependency-distance"]], "Descriptive Statistics": [[2, "descriptive-statistics"]], "Extractor": [[3, "extractor"]], "API": [[3, "api"]], "Frequently Asked Questions": [[4, "frequently-asked-questions"]], "How do I test the code and run the test suite?": [[4, "how-do-i-test-the-code-and-run-the-test-suite"]], "Does this package run on X?": [[4, "does-this-package-run-on-x"]], "How is the documentation generated?": [[4, "how-is-the-documentation-generated"]], "TextDescriptives": [[5, "textdescriptives"]], "Where to ask questions?": [[5, "where-to-ask-questions"]], "Citation": [[5, "citation"]], "Indices and search": [[5, "indices-and-search"]], "Information Theory": [[6, "information-theory"]], "Installation": [[7, "installation"]], "Development Installation": [[7, "development-installation"]], "News and Changelog": [[8, "news-and-changelog"]], "Part-of-Speech Proportions": [[9, "part-of-speech-proportions"]], "Quality": [[10, "quality"]], "Data Classes": [[10, "data-classes"]], "Readability": [[11, "readability"]], "Tutorials": [[12, "tutorials"], [12, null]], "Filtering corpora using Quality": [[13, "filtering-corpora-using-quality"]], "Setup": [[13, "setup"], [15, "setup"]], "Filtering Web content": [[13, "filtering-web-content"]], "The Data": [[13, "the-data"]], "Filtering": [[13, "filtering"]], "Extracting high quality texts": [[13, "extracting-high-quality-texts"]], "Changing the filters": [[13, "changing-the-filters"]], "Comparing Domains": [[13, "comparing-domains"]], "Data": [[13, "data"]], "NOTE: The DAGW dataset is currently unavailable due to a copyright dispute. The remainder of the tutorial has been disabled for now (converted to markdown), and will be re-enabled once the dispute settles.": [[13, "note-the-dagw-dataset-is-currently-unavailable-due-to-a-copyright-dispute-the-remainder-of-the-tutorial-has-been-disabled-for-now-converted-to-markdown-and-will-be-re-enabled-once-the-dispute-settles"]], "Quality Filtering": [[13, "quality-filtering"]], "Filtering out the text": [[13, "filtering-out-the-text"]], "Comparing across domains": [[13, "comparing-across-domains"]], "Introductory Tutorial": [[14, "introductory-tutorial"]], "Using TextDescriptives": [[14, "using-textdescriptives"]], "The easy way: Extract metrics with extract_metrics": [[14, "the-easy-way-extract-metrics-with-extract-metrics"]], "The configurable way: Add pipes to spaCy": [[14, "the-configurable-way-add-pipes-to-spacy"]], "Exploratory Data Analysis": [[14, "exploratory-data-analysis"]], "Scikit-learn Integration": [[15, "scikit-learn-integration"]], "Quick Start": [[16, "quick-start"]], "Usage with spaCy": [[16, "usage-with-spacy"]], "Using Specific Components": [[16, "using-specific-components"]], "Available Attributes": [[16, "available-attributes"]]}, "indexentries": {"create_coherence_component() (in module textdescriptives.components.coherence)": [[0, "textdescriptives.components.coherence.create_coherence_component"]], "create_dependency_distance_component() (in module textdescriptives.components.dependency_distance)": [[1, "textdescriptives.components.dependency_distance.create_dependency_distance_component"]], "create_descriptive_stats_component() (in module textdescriptives.components.descriptive_stats)": [[2, "textdescriptives.components.descriptive_stats.create_descriptive_stats_component"]], "extract_df() (in module textdescriptives.extractors)": [[3, "textdescriptives.extractors.extract_df"]], "extract_dict() (in module textdescriptives.extractors)": [[3, "textdescriptives.extractors.extract_dict"]], "extract_metrics() (in module textdescriptives.extractors)": [[3, "textdescriptives.extractors.extract_metrics"]], "create_information_theory_component() (in module textdescriptives.components.information_theory)": [[6, "textdescriptives.components.information_theory.create_information_theory_component"]], "create_pos_proportions_component() (in module textdescriptives.components.pos_proportions)": [[9, "textdescriptives.components.pos_proportions.create_pos_proportions_component"]], "alpha_ratio (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.alpha_ratio"]], "alpha_ratio (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.alpha_ratio"]], "contains (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.contains"]], "contains (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.contains"]], "create_quality_component() (in module textdescriptives.components.quality)": [[10, "textdescriptives.components.quality.create_quality_component"]], "doc_length (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.doc_length"]], "doc_length (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.doc_length"]], "duplicate_line_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.duplicate_line_chr_fraction"]], "duplicate_line_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.duplicate_line_chr_fraction"]], "duplicate_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.duplicate_ngram_chr_fraction"]], "duplicate_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.duplicate_ngram_chr_fraction"]], "duplicate_paragraph_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.duplicate_paragraph_chr_fraction"]], "duplicate_paragraph_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.duplicate_paragraph_chr_fraction"]], "mean_word_length (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.mean_word_length"]], "mean_word_length (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.mean_word_length"]], "n_stop_words (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.n_stop_words"]], "n_stop_words (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.n_stop_words"]], "oov_ratio (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.oov_ratio"]], "oov_ratio (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.oov_ratio"]], "passed (textdescriptives.components.quality_data_classes.qualityoutput property)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.passed"]], "passed (textdescriptives.components.quality_data_classes.thresholdsoutput property)": [[10, "textdescriptives.components.quality_data_classes.ThresholdsOutput.passed"]], "proportion_bullet_points (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.proportion_bullet_points"]], "proportion_bullet_points (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.proportion_bullet_points"]], "proportion_ellipsis (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.proportion_ellipsis"]], "proportion_ellipsis (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.proportion_ellipsis"]], "symbol_to_word_ratio (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.symbol_to_word_ratio"]], "symbol_to_word_ratio (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.symbol_to_word_ratio"]], "threshold (textdescriptives.components.quality_data_classes.thresholdsoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.ThresholdsOutput.threshold"]], "to_flat_value_dict() (textdescriptives.components.quality_data_classes.qualityoutput method)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.to_flat_value_dict"]], "top_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualityoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityOutput.top_ngram_chr_fraction"]], "top_ngram_chr_fraction (textdescriptives.components.quality_data_classes.qualitythresholds attribute)": [[10, "textdescriptives.components.quality_data_classes.QualityThresholds.top_ngram_chr_fraction"]], "value (textdescriptives.components.quality_data_classes.thresholdsoutput attribute)": [[10, "textdescriptives.components.quality_data_classes.ThresholdsOutput.value"]], "create_readability_component() (in module textdescriptives.components.readability)": [[11, "textdescriptives.components.readability.create_readability_component"]]}}) \ No newline at end of file diff --git a/tutorials/filter_corpus_using_quality.html b/tutorials/filter_corpus_using_quality.html index c3f1d90e..574fe60e 100644 --- a/tutorials/filter_corpus_using_quality.html +++ b/tutorials/filter_corpus_using_quality.html @@ -294,7 +294,7 @@

The Data{"version_major": 2, "version_minor": 0, "model_id": "e7afd2cc69914b98bca522a165e7fcca"} +
@@ -857,7 +857,7 @@

Comparing across domains - + diff --git a/tutorials/introductory_tutorial.html b/tutorials/introductory_tutorial.html index aa1c519f..f96233b8 100644 --- a/tutorials/introductory_tutorial.html +++ b/tutorials/introductory_tutorial.html @@ -385,25 +385,25 @@

The easy way: Extract metrics with 0 ham Go until jurong point, crazy.. Available only ... - 4.150000 - 5.0 - 1.904600 - 10.0 - 10.0 - 5.0 - 1.250000 - 1.0 - ... + 2.333333 + 0.833333 0.388889 0.055556 90.935000 3.060000 NaN 4.0 - 3.116500 - 5.642000 - 15.000000 - 0.5 + ... + 10.0 + 5.0 + 1.250000 + 1.0 + 0.433013 + 20 + 20 + 1.000000 + 92 + 2 1 ham Ok lar... Joking wif u oni... - 3.000000 - 3.0 - 1.527525 - 3.0 - 3.0 - 1.0 - 1.166667 - 1.0 - ... + 1.333333 + 0.666667 0.433333 0.233333 105.090000 -0.653333 NaN 1.2 - -5.800000 - -8.026667 - 3.000000 - 0.0 + ... + 3.0 + 1.0 + 1.166667 + 1.0 + 0.372678 + 6 + 6 + 1.000000 + 24 + 2 2 spam Free entry in 2 a wkly comp to win FA Cup fina... - 4.392857 - 4.0 - 3.298693 - 14.0 - 14.0 - 2.0 - 1.178571 - 1.0 - ... + 2.516807 + 0.659664 0.521008 0.050420 92.917857 3.777143 NaN 5.6 - 6.260357 - 7.915714 - 28.285714 + ... + 14.0 2.0 + 1.178571 + 1.0 + 0.382993 + 28 + 24 + 0.857143 + 128 + 2 3 ham U dun say so early hor... U c already then say... - 3.000000 - 3.0 - 1.758098 - 5.5 - 5.5 - 0.5 - 1.000000 - 1.0 - ... + 1.452381 + 0.119048 0.535714 0.035714 116.652500 -1.645000 NaN 2.2 - -4.550000 - -3.541818 - 14.590909 + ... + 5.5 0.5 + 1.000000 + 1.0 + 0.000000 + 11 + 9 + 0.818182 + 39 + 2 4 ham Nah I don't think he goes to usf, he lives aro... - 3.461538 - 3.0 - 1.598816 - 13.0 - 13.0 - 0.0 - 1.000000 - 1.0 - ... + 2.066667 + 0.000000 0.400000 0.000000 109.040000 1.280000 NaN 5.2 - 1.373846 - 2.276923 - 13.000000 + ... + 13.0 0.0 + 1.000000 + 1.0 + 0.000000 + 13 + 12 + 0.923077 + 49 + 1 @@ -557,7 +557,7 @@

The configurable way: Add pipes to spaCy -