From be7437aa2a77c650103a7ee093deacb8bc667e7f Mon Sep 17 00:00:00 2001
From: Weizheng Lu
Date: Thu, 9 May 2024 23:10:56 +0800
Subject: [PATCH] add github pages docs (#6)

---
 .github/workflows/deploy.yml   |  42 ++++++++++++
 docs/README.md                 |  29 ++++++++
 docs/_static/custom.css        |   3 +
 docs/_static/logo.ico          | Bin 0 -> 4022 bytes
 docs/_toc.yml                  |  10 +++
 docs/conf.py                   |  49 ++++++++++++++
 docs/dataframe/dataframe.md    |   6 ++
 docs/index.md                  |  26 +++++++
 docs/perf/ad-hoc-query.md      |   2 +
 docs/requirements-doc.txt      |  11 ++++
 tpch/cudf_queries/queries.py   | 107 ++++++++++++++++-----------------
 tpch/dask_queries/queries.py   |   2 +-
 tpch/pandas_queries/queries.py |   7 ++-
 13 files changed, 234 insertions(+), 60 deletions(-)
 create mode 100644 .github/workflows/deploy.yml
 create mode 100644 docs/README.md
 create mode 100644 docs/_static/custom.css
 create mode 100644 docs/_static/logo.ico
 create mode 100644 docs/_toc.yml
 create mode 100644 docs/conf.py
 create mode 100644 docs/dataframe/dataframe.md
 create mode 100644 docs/index.md
 create mode 100644 docs/perf/ad-hoc-query.md
 create mode 100644 docs/requirements-doc.txt

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..95626a1
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,42 @@
+name: Deploy Docs
+
+on:
+  push:
+    tags:
+      - '*'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.11"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r docs/requirements-doc.txt
+      - name: Build website
+        run: |
+          sphinx-build -b html docs docs/_build/html
+          touch docs/_build/html/.nojekyll
+      # Upload the book's HTML as an artifact
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v2
+        with:
+          path: "docs/_build/html"
+
+      # Deploy the book's HTML to GitHub Pages
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v2
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..99b3255
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,29 @@
+# Build the Docs
+
+## Environment Setup
+
+Install the dependencies listed in `requirements-doc.txt`; these are the tools needed to build this documentation.
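+
+For example, assuming the command is run from the repository root (this patch adds the file at `docs/requirements-doc.txt`), the install step could look like this:
+
+```bash
+python -m pip install -r docs/requirements-doc.txt
+```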
+
+Navigate to the `docs` folder and build the site:
+
+```bash
+cd docs
+sphinx-build -b html ./ ./_build/html
+```
+
+Web-related files will be generated in the `docs/_build` directory.
+
+## Start an HTTP Server
+
+After building the HTML files, you can serve them with Python's built-in HTTP server and open http://127.0.0.1:8000 in your browser to view the result:
+
+```bash
+cd _build/html
+python -m http.server 8000
+```
\ No newline at end of file
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
new file mode 100644
index 0000000..194630d
--- /dev/null
+++ b/docs/_static/custom.css
@@ -0,0 +1,3 @@
+html[data-theme="light"] {
+    --sbt-color-announcement: rgb(125, 125, 125);
+}
\ No newline at end of file
diff --git a/docs/_static/logo.ico b/docs/_static/logo.ico
new file mode 100644
index 0000000000000000000000000000000000000000..5169b6d6eed8f1609547423dd546f91ae9cf28c9
GIT binary patch
literal 4022
zcmeHKYjjlA72Y#>OcFvKK%SL{4_a-bSfx@Bi;p}>cq9Z3VnCju#E^s}BtS?Y1VRXe
z5dlHo1wlXym`EsTE868+uC}!Lqjuq1UA3jP{?JuKQ|`UzvHRV5b&`oo`k$Ni%{lkX
zoc+z-=lk}Z5IOKSW)#`4bR?cAnTYNM#sHH6d);&4y}KXq!n<($)n5V5>Lwrx$O6hZ
z`^^cQcP4^!-|&BA&{0)ok*dsd%9W3gmvs}%WWx`$<=XE@$u${vJR6BXF|Y`@2k@*1
z2lK5tFxCTK1I#%;|D5w*-LGfte^}3Qhv~xI!CO3YRh2`s`bkpNnbT#>luzZVM_8`8
zzfZ0kyW02{IXJT?7O{>3(i1bQ`vZ&_K)+b_J&+1~1TbfxXU;ptJnb;+!abkq;%zVL
zl8qUdRdS&e-mJJWGZ2TU}=FwMWha-ibx;P2(Zg^taalKfF
zv0n{*_cs5ZIjf5~_jTriW2|Sq!pNQV(`~GuZMvrC)wZbxDS_9{KS$O)
z_O+}ZkKBL2vguBi8`7>?os`U(7xNI)0{qiJ5o7A^gYDtm67DxWo4R%eBy?uTXFk6ErB$EZ8Y^>?t?nDjEIvdxhX
z+!}w*S>o(-xtw>pj|+~np1GHG@ebC{Y+^mPk@dV9RtuN2TC#xEPoL4MV*2l7^;5;N
z_R(vy{vnoY??c_kuxv^BQnU_xTrH@$Wq$iW!3&&8;QZt7>FEbp7wtsd+mZWv#$(Xri8g-Q=;d5Z?*n@LE9b!
zcLp@~sZVpLdmXCZA
zvBPOuyB*!Kwy2W}j$532#_MO>(|M=d0M7c9pEfY+r?(K!n{-J_F*IieG`IA5Xs+Z6
z&OHfvlUy|+M>afk!>>8fl8oHr{)*f;h)%~%u``^lmm}HQ8z;r)iTLg{@G6X#mE
z%k8c6t^jZFM9*DI;7-w{t3K$jxgtZY$S>s5Mw0apA3&cA(p(~owwPn6d$(|hv9&vb
zt$k5!9g1P=SPYv#OJcG8QRucDdTw^>dEl7S984RS>(V+BWu*kKw?cFH?(@8wTT#H!
z+!37lIMn?g-acn*ZUpKcdtGdeScbZPBwn&}cd>OKnyn+TY`q%C=II1B&nB{U5<1y1
z&29YNN$6t%&VduPsEoipK`mMM1vJMV%~fTytog~`;PvCsoc8oNkLKc8Y>Da-Tb=RZ
z#jx?#?#P4ID^Xuqhle2dp=_RvXY=)8Y@Qpz=3B{Zo*(7@&s8GP{=`7qz&&~lsl`j8
z;q@+6UTk}v(Q6rg({l9#BGBh-&9y}f(H=fuv`3P)&qdZ=SA=yi`Z4Qh>G3=~)KIw*eA;mI&k@hR2_d9nPz3BrfL+(|@
zGy?Z%kJndD;m|$@&24cpc$+Yi=;$A!@89Fe>`EcCCpFsaPMK)*q~;jE9y!A3PNu%U
zKhpohgWRdg9O9Y#?PuIhSr`0l^=Ync2)M)kDYj7tJRcGPTJ(G;(doPw?>7N2!5_2X
zk0{~x)+%IE(1KYziST=d@LYcqn*S>BZ1rodJ(3agpm0+d;s)*XPT>Jt3-`c_SFBy|
zqZc==J&`M|83e6Fh3px)0H34R{3uSYzW?n3nq$%7{0h8THn*K?@Z~xGLooyoR9T1M
z1KSG$#C!-}CIP`)2A!kF5PU|m@w=7wvo)Z(t>Ne=j!WQ;MSVl|+Whc5L4S?rDR{p>
zJ~m&APX|Kw4Yq*aQUva?_)YV%X!U7so6`VqDS9n^(^*3EWS$=}$?Qt{%=TUYZzkxJfii`?tM8#A!(!JDCbGP{$V@Olya)@%GS{R+Il
z9X%ioJ|y$)Z}xw19D9!bMxrGZJu5N>=N>0I$!VYaw_V2jchb#2-ABf|BV&wrM#dYz
z85L%{i+_}EU$-mZs{s41Pk3Yek}WYQ>5!Ukq1T;xok+>d-0%4
e(`2AIo~hZz)CA}-UJq;l-q9|;#yk8x{_($}!jTyO

literal 0
HcmV?d00001

diff --git a/docs/_toc.yml b/docs/_toc.yml
new file mode 100644
index 0000000..9a067f7
--- /dev/null
+++ b/docs/_toc.yml
@@ -0,0 +1,10 @@
+format: jb-book
+root: index
+parts:
+- caption: DataFrame
+  chapters:
+  - file: dataframe/dataframe
+
+- caption: Performance
+  chapters:
+  - file: perf/ad-hoc-query
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..31a7fdf
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,49 @@
+author = 'DF-Eval Team'
+bibtex_bibfiles = ['references.bib']
+bibtex_reference_style = 'author_year'
+comments_config = {'hypothesis': False, 'utterances': False}
+copyright = '2023-2024'
+exclude_patterns = ['**.ipynb_checkpoints', '.DS_Store', 'Thumbs.db', '_build']
+extensions = ['sphinx_togglebutton', 'sphinx_copybutton', 'myst_nb', 'jupyter_book', 'sphinx_thebe', 'sphinx_comments', 'sphinx_external_toc', 'sphinx.ext.intersphinx', 'sphinx_design', 'sphinx_book_theme', 'sphinxcontrib.bibtex', 'sphinx_jupyterbook_latex']
+external_toc_exclude_missing = True
+external_toc_path = '_toc.yml'
+html_baseurl = ''
+html_favicon = "_static/logo.ico"
+html_logo = '_static/logo.ico'
+html_sourcelink_suffix = ''
+html_theme = 'sphinx_book_theme'
+html_theme_options = {
+    'search_bar_text': 'Search...',
+    'path_to_docs': 'docs',
+    'repository_url': 'https://github.com/godaai/df-eval',
+    'repository_branch': 'main',
+    'extra_footer': '',
+    'home_page_in_toc': True,
+    'announcement': "If you find this page helpful, please star us on GitHub.",
+    'analytics': {'google_analytics_id': ''},
+    'use_repository_button': True,
+    'use_edit_page_button': False,
+    'use_issues_button': False,
+    "toc_title": "In this page",
+}
+html_static_path = ["_static"]
+html_css_files = ["custom.css"]
+html_js_files = [
+    "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js",
+]
+html_title = 'DF-Eval'
+latex_engine = 'pdflatex'
+myst_enable_extensions = ['colon_fence', 'dollarmath', 'linkify', 'substitution', 'tasklist']
+myst_url_schemes = ['mailto', 'http', 'https']
+nb_execution_allow_errors = False
+nb_execution_cache_path = ''
+nb_execution_excludepatterns = []
+nb_execution_in_temp = False
+nb_execution_mode = 'off'
+nb_execution_timeout = 30
+nb_output_stderr = 'show'
+numfig = False
+pygments_style = 'sphinx'
+suppress_warnings = ['myst.domains']
+use_jupyterbook_latex = True
+use_multitoc_numbering = True
diff --git a/docs/dataframe/dataframe.md b/docs/dataframe/dataframe.md
new file mode 100644
index 0000000..20d88b3
--- /dev/null
+++ b/docs/dataframe/dataframe.md
@@ -0,0 +1,6 @@
+(sec-dataframe)=
+# What's a DataFrame?
+
+In recent years, the convenience of DataFrames has made them the tool of choice for data scientists across a variety of tasks, from data loading, cleaning, and wrangling to statistical modeling and visualization. [pandas](https://pandas.pydata.org/), the most popular DataFrame system, is the de facto standard. It is easy to use: even users with little Python programming experience can quickly learn to do data analysis with it. It also provides a powerful set of tools for data manipulation, including filtering, merging, grouping, and aggregating data.
+
+## DataFrame Algebra
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..0259eed
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,26 @@
+# DF-Eval
+
+::::{grid} 2
+:reverse:
+
+:::{grid-item}
+:columns: 3
+:class: sd-m-auto
+
+
+:::
+
+:::{grid-item}
+:columns: 9
+:class: sd-fs-3
+
+A multi-level, multi-dimensional evaluation suite for DataFrame systems.
+
+% The SVG rendering of the GitHub badge breaks LaTeX builds, so only include it in HTML
+```{only} html
+[![](https://img.shields.io/github/stars/godaai/df-eval?style=for-the-badge)](https://github.com/godaai/df-eval)
+```
+
+:::
+
+::::
\ No newline at end of file
diff --git a/docs/perf/ad-hoc-query.md b/docs/perf/ad-hoc-query.md
new file mode 100644
index 0000000..1e4e346
--- /dev/null
+++ b/docs/perf/ad-hoc-query.md
@@ -0,0 +1,2 @@
+(sec-ad-hoc-query)=
+# Ad-hoc Query
\ No newline at end of file
diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt
new file mode 100644
index 0000000..f8bbe4a
--- /dev/null
+++ b/docs/requirements-doc.txt
@@ -0,0 +1,11 @@
+jupyter-book
+sphinx_togglebutton
+sphinx_copybutton
+myst_nb
+sphinx_comments
+sphinx_external_toc
+sphinx_design
+sphinx_book_theme
+sphinxcontrib-bibtex
+sphinx-jupyterbook-latex
+sphinxcontrib-jsmath
\ No newline at end of file
diff --git a/tpch/cudf_queries/queries.py b/tpch/cudf_queries/queries.py
index 34ccf8f..8cc5033 100644
--- a/tpch/cudf_queries/queries.py
+++ b/tpch/cudf_queries/queries.py
@@ -1,7 +1,6 @@
 import cudf
 import cudf.pandas
 cudf.pandas.install()
-import pandas as pd
 
 import argparse
 import json
@@ -10,9 +9,7 @@
 import traceback
 from typing import Dict
 
-import sys
-
-# import pandas as pd
+import pandas as pd
 
 from common_utils import log_time_fn, parse_common_arguments, print_result_fn
 
 dataset_dict = {}
@@ -128,7 +125,7 @@ def q01(root: str, storage_options: Dict):
         ],
     ]
     sel = lineitem_filtered.L_SHIPDATE <= date
-    lineitem_filtered = lineitem_filtered[sel]
+    lineitem_filtered = lineitem_filtered.loc[sel]
     lineitem_filtered["AVG_QTY"] = lineitem_filtered.L_QUANTITY
     lineitem_filtered["AVG_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE
     lineitem_filtered["DISC_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE * (
@@ -316,7 +313,7 @@ def q03(root: str, storage_options: Dict):
         :, ["L_ORDERKEY", "REVENUE", "O_ORDERDATE", "O_SHIPPRIORITY"]
     ]
 
-    # [change 1]Convert cudf DataFrame to Pandas DataFrame and format timestamp
+    # [DIFF] Convert cudf DataFrame to pandas DataFrame and format the timestamp
     total["O_ORDERDATE"] = pd.to_datetime(total["O_ORDERDATE"]).dt.strftime("%Y-%m-%d")
     return total
 
@@ -408,8 +405,8 @@ def q07(root: str, storage_options: Dict):
         (lineitem["L_SHIPDATE"] >= pd.Timestamp("1995-01-01"))
         & (lineitem["L_SHIPDATE"] < pd.Timestamp("1997-01-01"))
     ]
-    lineitem_filtered["L_YEAR"] = lineitem_filtered["L_SHIPDATE"].dt.year
-    lineitem_filtered["VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (
+    lineitem_filtered.loc[:, "L_YEAR"] = lineitem_filtered["L_SHIPDATE"].dt.year
+    lineitem_filtered.loc[:, "VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (
         1.0 - lineitem_filtered["L_DISCOUNT"]
     )
     lineitem_filtered = lineitem_filtered.loc[
@@ -704,11 +701,11 @@ def g2(x):
         columns={"g1": "HIGH_LINE_COUNT", "g2": "LOW_LINE_COUNT"}
     )
 
-    # Round the result to one decimal place -- If you use test_result.py to test the results, please uncomment the following two lines.
+    # Round the result to one decimal place
+    # If you use test_result.py to test the results, please uncomment the following two lines.
# total["HIGH_LINE_COUNT"] = total["HIGH_LINE_COUNT"].astype(float).round(1) # total["LOW_LINE_COUNT"] = total["LOW_LINE_COUNT"].astype(float).round(1) - return total @@ -730,14 +727,11 @@ def q13(root: str, storage_options: Dict): count_df = c_o_merged.groupby(["C_CUSTKEY"], as_index=False).agg( C_COUNT=pd.NamedAgg(column="O_ORDERKEY", aggfunc="count") ) - total = count_df.groupby(["C_COUNT"], as_index=False).size() - # [change 3] for TypeError: Series.sort_values() got an unexpected keyword argument 'by' - # the error is caused here: in cuDF,DataFrameGroupBy.size() Return the size of each group. https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.core.groupby.groupby.dataframegroupby.size/# - # while in pandas, DataFrameGroupBy.size() Returns DataFrame or Series, Number of rows in each group as a Series if as_index is True or a DataFrame if as_index is False. https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.size.html#pandas.core.groupby.DataFrameGroupBy.size + + # [DIFF] groupby.agg is a `Series` and convert `Series` to `DataFrame` total = total.reset_index(name='size') total.columns = ["C_COUNT", "CUSTDIST"] - total = total.sort_values( by=["CUSTDIST", "C_COUNT"], ascending=[False, False], @@ -811,18 +805,18 @@ def q16(root: str, storage_options: Dict): partsupp = load_partsupp(root, storage_options) supplier = load_supplier(root, storage_options) - BRAND = "Brand#45" - TYPE = "MEDIUM POLISHED" - SIZE_LIST = [49, 14, 23, 45, 19, 3, 36, 9] + brand = "Brand#45" + p_type = "MEDIUM POLISHED" + size_list = [49, 14, 23, 45, 19, 3, 36, 9] # Merge part and partsupp DataFrames merged_df = pd.merge(part, partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY", how="inner") # Apply filters filtered_df = merged_df[ - (merged_df["P_BRAND"] != BRAND) & - (~merged_df["P_TYPE"].str.startswith(TYPE)) & - (merged_df["P_SIZE"].isin(SIZE_LIST)) + (merged_df["P_BRAND"] != brand) & + (~merged_df["P_TYPE"].str.startswith(p_type)) & + (merged_df["P_SIZE"].isin(size_list)) ] # Exclude unwanted suppliers @@ -887,7 +881,7 @@ def q18(root: str, storage_options: Dict): total = gb2.sort_values(["O_TOTALPRICE", "O_ORDERDATE"], ascending=[False, True]) total = total.head(100) - # [change 2]Convert cudf DataFrame to Pandas DataFrame and format timestamp + # [DIFF] Convert cudf DataFrame to Pandas DataFrame and format timestamp total["O_ORDERDATE"] = pd.to_datetime(total["O_ORDERDATE"]).dt.strftime("%Y-%m-%d") return total @@ -959,42 +953,42 @@ def q19(root: str, storage_options: Dict): jn = flineitem.merge(fpart, left_on="L_PARTKEY", right_on="P_PARTKEY") jnsel = ( ( - (jn.P_BRAND == brand1) - & ( - (jn.P_CONTAINER == "SM BOX") - | (jn.P_CONTAINER == "SM CASE") - | (jn.P_CONTAINER == "SM PACK") - | (jn.P_CONTAINER == "SM PKG") - ) - & (jn.L_QUANTITY >= quantity1) - & (jn.L_QUANTITY <= quantity1 + 10) - & (jn.P_SIZE <= 5) - ) - | - ( - (jn.P_BRAND == brand2) - & ( - (jn.P_CONTAINER == "MED BAG") - | (jn.P_CONTAINER == "MED BOX") - | (jn.P_CONTAINER == "MED PACK") - | (jn.P_CONTAINER == "MED PKG") - ) - & (jn.L_QUANTITY >= quantity2) - & (jn.L_QUANTITY <= quantity2 + 10) - & (jn.P_SIZE <= 10) + (jn.P_BRAND == brand1) + & ( + (jn.P_CONTAINER == "SM BOX") + | (jn.P_CONTAINER == "SM CASE") + | (jn.P_CONTAINER == "SM PACK") + | (jn.P_CONTAINER == "SM PKG") + ) + & (jn.L_QUANTITY >= quantity1) + & (jn.L_QUANTITY <= quantity1 + 10) + & (jn.P_SIZE <= 5) + ) + | + ( + (jn.P_BRAND == brand2) + & ( + (jn.P_CONTAINER == "MED BAG") + | (jn.P_CONTAINER == "MED BOX") + | (jn.P_CONTAINER == 
"MED PACK") + | (jn.P_CONTAINER == "MED PKG") + ) + & (jn.L_QUANTITY >= quantity2) + & (jn.L_QUANTITY <= quantity2 + 10) + & (jn.P_SIZE <= 10) ) | ( - (jn.P_BRAND == brand3) - & ( - (jn.P_CONTAINER == "LG BOX") - | (jn.P_CONTAINER == "LG CASE") - | (jn.P_CONTAINER == "LG PACK") - | (jn.P_CONTAINER == "LG PKG") - ) - & (jn.L_QUANTITY >= quantity3) - & (jn.L_QUANTITY <= quantity3 + 10) - & (jn.P_SIZE <= 15) + (jn.P_BRAND == brand3) + & ( + (jn.P_CONTAINER == "LG BOX") + | (jn.P_CONTAINER == "LG CASE") + | (jn.P_CONTAINER == "LG PACK") + | (jn.P_CONTAINER == "LG PKG") + ) + & (jn.L_QUANTITY >= quantity3) + & (jn.L_QUANTITY <= quantity3 + 10) + & (jn.P_SIZE <= 15) ) ) jn = jn[jnsel] @@ -1104,7 +1098,7 @@ def q21(root: str, storage_options: Dict): ) total = total.loc[:, ["S_NAME"]] total = total.groupby("S_NAME", as_index=False).size() - # [change 4] add reset_index for the same error in q13 + # [DIFF] groupby.add `Series` to `DataFrame` total = total.reset_index(name='size') total.columns = ["S_NAME", "NUMWAIT"] total = total.sort_values(by=["NUMWAIT", "S_NAME"], ascending=[False, True]) @@ -1145,7 +1139,7 @@ def q22(root: str, storage_options: Dict): ) customer_selected = customer_selected.loc[:, ["CNTRYCODE", "C_ACCTBAL"]] agg1 = customer_selected.groupby(["CNTRYCODE"], as_index=False).size() - # [change 5] add reset_index for the same error in q13 + # [DIFF] groupby.add `Series` to `DataFrame` agg1 = agg1.reset_index(name='size') agg1.columns = ["CNTRYCODE", "NUMCUST"] diff --git a/tpch/dask_queries/queries.py b/tpch/dask_queries/queries.py index 7274e07..ee0ddb1 100644 --- a/tpch/dask_queries/queries.py +++ b/tpch/dask_queries/queries.py @@ -1359,7 +1359,7 @@ def main(): if args.endpoint == "local" or args.endpoint is None: from dask.distributed import LocalCluster - client = LocalCluster() + client = Client(LocalCluster()) elif args.endpoint: client = Client(args.endpoint) diff --git a/tpch/pandas_queries/queries.py b/tpch/pandas_queries/queries.py index e24bd99..440c302 100644 --- a/tpch/pandas_queries/queries.py +++ b/tpch/pandas_queries/queries.py @@ -695,7 +695,8 @@ def g2(x): columns={"g1": "HIGH_LINE_COUNT", "g2": "LOW_LINE_COUNT"} ) - # Round the result to one decimal place -- If you use test_result.py to test the results, please uncomment the following two lines. + # Round the result to one decimal + # If you use test_result.py to test the results, please uncomment the following two lines. # total["HIGH_LINE_COUNT"] = total["HIGH_LINE_COUNT"].astype(float).round(1) # total["LOW_LINE_COUNT"] = total["LOW_LINE_COUNT"].astype(float).round(1) @@ -796,7 +797,7 @@ def q16(root: str, storage_options: Dict): supplier = load_supplier(root, storage_options) brand = "Brand#45" - type = "MEDIUM POLISHED" + p_type = "MEDIUM POLISHED" size_list = [49, 14, 23, 45, 19, 3, 36, 9] # Merge part and partsupp DataFrames @@ -805,7 +806,7 @@ def q16(root: str, storage_options: Dict): # Apply filters filtered_df = merged_df[ (merged_df["P_BRAND"] != brand) & - (~merged_df["P_TYPE"].str.startswith(type)) & + (~merged_df["P_TYPE"].str.startswith(p_type)) & (merged_df["P_SIZE"].isin(size_list)) ]