add github pages docs (#6)
luweizheng authored May 9, 2024
1 parent fbcfd52 commit be7437a
Showing 13 changed files with 228 additions and 61 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,42 @@
name: Deploy Docs

on:
push:
tags:
- '*'

jobs:
build:
runs-on: ubuntu-latest
permissions:
pages: write
id-token: write
strategy:
fail-fast: false
matrix:
python-version: ["3.11"]

steps:
- uses: actions/checkout@v3
- name: Set up Python envs
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements-web.txt
- name: Build website
run: |
sphinx-build -b html ./ ./_build/html
touch ./_build/html/.nojekyll
# Upload the book's HTML as an artifact
- name: Upload artifact
uses: actions/upload-pages-artifact@v2
with:
path: "_build/html"

# Deploy the book's HTML to GitHub Pages
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v2
23 changes: 23 additions & 0 deletions docs/README.md
@@ -0,0 +1,23 @@
# Build the Docs

## Environment Setup

Install the dependencies listed in `requirements-doc.txt`, which includes the tools needed to build this documentation.

Navigate to the project folder and build the project:

```bash
cd docs
sphinx-build -b html ./ ./_build/html
```

Web-related files will be generated in the `docs/_build` directory.

## Start HTTP Server

After building the HTML files, you can serve them with Python's built-in HTTP server and open http://127.0.0.1:8000 in your browser to view the result:

```bash
cd _build/html
python -m http.server 8000
```
3 changes: 3 additions & 0 deletions docs/_static/custom.css
@@ -0,0 +1,3 @@
html[data-theme="light"] {
--sbt-color-announcement: rgb(125, 125, 125);
}
Binary file added docs/_static/logo.ico
Binary file not shown.
10 changes: 10 additions & 0 deletions docs/_toc.yml
@@ -0,0 +1,10 @@
format: jb-book
root: index
parts:
- caption: DataFrame
chapters:
- file: dataframe/dataframe

- caption: Performance
chapters:
- file: perf/ad-hoc-query
49 changes: 49 additions & 0 deletions docs/conf.py
@@ -0,0 +1,49 @@
author = 'DF-Eval Team'
bibtex_bibfiles = ['references.bib']
bibtex_reference_style = 'author_year'
comments_config = {'hypothesis': False, 'utterances': False}
copyright = '2023-2024'
exclude_patterns = ['**.ipynb_checkpoints', '.DS_Store', 'Thumbs.db', '_build']
extensions = ['sphinx_togglebutton', 'sphinx_copybutton', 'myst_nb', 'jupyter_book', 'sphinx_thebe', 'sphinx_comments', 'sphinx_external_toc', 'sphinx.ext.intersphinx', 'sphinx_design', 'sphinx_book_theme', 'sphinxcontrib.bibtex', 'sphinx_jupyterbook_latex']
external_toc_exclude_missing = True
external_toc_path = '_toc.yml'
html_baseurl = ''
html_favicon = "_static/logo.ico"
html_logo = 'logo.svg'
html_sourcelink_suffix = ''
html_theme = 'sphinx_book_theme'
html_theme_options = {
'search_bar_text': 'Search...',
'path_to_docs': 'docs',
'repository_url': 'https://github.com/godaai/df-eval',
'repository_branch': 'main',
'extra_footer': '',
'home_page_in_toc': True,
'announcement': "If you find this page helpful, please star us on <a href=\"https://github.com/godaai/df-eval\">GitHub</a>.",
'analytics': {'google_analytics_id': ''},
'use_repository_button': True,
'use_edit_page_button': False,
'use_issues_button': False,
"toc_title": "On this page",
}
html_static_path = ["_static"]
html_css_files = ["custom.css"]
html_js_files = [
"https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js",
]
html_title = 'DF-Eval'
latex_engine = 'pdflatex'
myst_enable_extensions = ['colon_fence', 'dollarmath', 'linkify', 'substitution', 'tasklist']
myst_url_schemes = ['mailto', 'http', 'https']
nb_execution_allow_errors = False
nb_execution_cache_path = ''
nb_execution_excludepatterns = []
nb_execution_in_temp = False
nb_execution_mode = 'off'
nb_execution_timeout = 30
nb_output_stderr = 'show'
numfig = False
pygments_style = 'sphinx'
suppress_warnings = ['myst.domains']
use_jupyterbook_latex = True
use_multitoc_numbering = True
6 changes: 6 additions & 0 deletions docs/dataframe/dataframe.md
@@ -0,0 +1,6 @@
(sec-dataframe)=
# What's DataFrame

In recent years, the convenience of DataFrames has made them the tool of choice for data scientists across a variety of tasks, from data loading, cleaning, and wrangling to statistical modeling and visualization. [pandas](https://pandas.pydata.org/), the most popular DataFrame system, is the de facto standard. It is easy to use: even users with little Python programming experience can quickly learn to do data analysis with it. It provides a powerful set of tools for data manipulation, including filtering, merging, grouping, and aggregating data.
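As a minimal sketch of those four operations (the tables and column names below are invented for illustration, not taken from any benchmark dataset):

```python
import pandas as pd

# Two small example tables: orders and customers
orders = pd.DataFrame({
    "customer_id": [1, 1, 2, 3],
    "amount": [10.0, 20.0, 5.0, 7.5],
})
customers = pd.DataFrame({
    "customer_id": [1, 2, 3],
    "region": ["EU", "US", "EU"],
})

# Filtering: keep orders worth at least 7.5
big = orders[orders["amount"] >= 7.5]

# Merging: attach each remaining order's region
joined = big.merge(customers, on="customer_id", how="inner")

# Grouping and aggregating: total order amount per region
totals = joined.groupby("region", as_index=False)["amount"].sum()
print(totals)
```

The same few lines cover the whole pipeline, which is much of the appeal compared to writing the equivalent joins and aggregations by hand.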

## DataFrame Algebra
26 changes: 26 additions & 0 deletions docs/index.md
@@ -0,0 +1,26 @@
# DF-Eval

::::{grid} 2
:reverse:

:::{grid-item}
:columns: 3
:class: sd-m-auto


:::

:::{grid-item}
:columns: 9
:class: sd-fs-3

A multi-level, multi-dimensional evaluation suite for DataFrame systems.

% The SVG rendering breaks latex builds for the GitHub badge, so only include in HTML
```{only} html
[![](https://img.shields.io/github/stars/godaai/df-eval?style=for-the-badge)](https://github.com/godaai/df-eval)
```

:::

::::
2 changes: 2 additions & 0 deletions docs/perf/ad-hoc-query.md
@@ -0,0 +1,2 @@
(sec-ad-hoc-query)=
# Ad-hoc Query
11 changes: 11 additions & 0 deletions docs/requirements-doc.txt
@@ -0,0 +1,11 @@
jupyter-book
sphinx_togglebutton
sphinx_copybutton
myst_nb
sphinx_comments
sphinx_external_toc
sphinx_design
sphinx_book_theme
sphinxcontrib-bibtex
sphinx-jupyterbook-latex
sphinxcontrib-jsmath
108 changes: 51 additions & 57 deletions tpch/cudf_queries/queries.py
@@ -1,7 +1,6 @@
import cudf
import cudf.pandas
cudf.pandas.install()
import pandas as pd

import argparse
import json
@@ -10,9 +9,7 @@
import traceback
from typing import Dict

import sys

# import pandas as pd
import pandas as pd
from common_utils import log_time_fn, parse_common_arguments, print_result_fn

dataset_dict = {}
@@ -128,7 +125,7 @@ def q01(root: str, storage_options: Dict):
],
]
sel = lineitem_filtered.L_SHIPDATE <= date
lineitem_filtered = lineitem_filtered[sel]
lineitem_filtered = lineitem_filtered.loc[sel]
lineitem_filtered["AVG_QTY"] = lineitem_filtered.L_QUANTITY
lineitem_filtered["AVG_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE
lineitem_filtered["DISC_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE * (
@@ -316,7 +313,7 @@ def q03(root: str, storage_options: Dict):
:, ["L_ORDERKEY", "REVENUE", "O_ORDERDATE", "O_SHIPPRIORITY"]
]

# [change 1]Convert cudf DataFrame to Pandas DataFrame and format timestamp
# [DIFF] Convert cudf DataFrame to Pandas DataFrame and format timestamp
total["O_ORDERDATE"] = pd.to_datetime(total["O_ORDERDATE"]).dt.strftime("%Y-%m-%d")
return total

@@ -408,8 +405,8 @@ def q07(root: str, storage_options: Dict):
(lineitem["L_SHIPDATE"] >= pd.Timestamp("1995-01-01"))
& (lineitem["L_SHIPDATE"] < pd.Timestamp("1997-01-01"))
]
lineitem_filtered["L_YEAR"] = lineitem_filtered["L_SHIPDATE"].dt.year
lineitem_filtered["VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (
lineitem_filtered.loc[:, "L_YEAR"] = lineitem_filtered["L_SHIPDATE"].dt.year
lineitem_filtered.loc[:, "VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (
1.0 - lineitem_filtered["L_DISCOUNT"]
)
lineitem_filtered = lineitem_filtered.loc[
@@ -704,11 +701,11 @@ def g2(x):
columns={"g1": "HIGH_LINE_COUNT", "g2": "LOW_LINE_COUNT"}
)

# Round the result to one decimal place -- If you use test_result.py to test the results, please uncomment the following two lines.
# Round the result to one decimal place.
# If you use test_result.py to test the results, please uncomment the following two lines.
# total["HIGH_LINE_COUNT"] = total["HIGH_LINE_COUNT"].astype(float).round(1)
# total["LOW_LINE_COUNT"] = total["LOW_LINE_COUNT"].astype(float).round(1)


return total


Expand All @@ -730,14 +727,11 @@ def q13(root: str, storage_options: Dict):
count_df = c_o_merged.groupby(["C_CUSTKEY"], as_index=False).agg(
C_COUNT=pd.NamedAgg(column="O_ORDERKEY", aggfunc="count")
)

total = count_df.groupby(["C_COUNT"], as_index=False).size()
# [change 3] fixes: TypeError: Series.sort_values() got an unexpected keyword argument 'by'
# In cuDF, DataFrameGroupBy.size() returns the size of each group: https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.core.groupby.groupby.dataframegroupby.size/#
# In pandas, DataFrameGroupBy.size() returns the number of rows in each group, as a Series if as_index is True or a DataFrame if as_index is False: https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.size.html#pandas.core.groupby.DataFrameGroupBy.size

# [DIFF] groupby(...).size() returns a `Series` here; convert it to a `DataFrame`
total = total.reset_index(name='size')
total.columns = ["C_COUNT", "CUSTDIST"]

total = total.sort_values(
by=["CUSTDIST", "C_COUNT"],
ascending=[False, False],
@@ -811,18 +805,18 @@ def q16(root: str, storage_options: Dict):
partsupp = load_partsupp(root, storage_options)
supplier = load_supplier(root, storage_options)

BRAND = "Brand#45"
TYPE = "MEDIUM POLISHED"
SIZE_LIST = [49, 14, 23, 45, 19, 3, 36, 9]
brand = "Brand#45"
p_type = "MEDIUM POLISHED"
size_list = [49, 14, 23, 45, 19, 3, 36, 9]

# Merge part and partsupp DataFrames
merged_df = pd.merge(part, partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY", how="inner")

# Apply filters
filtered_df = merged_df[
(merged_df["P_BRAND"] != BRAND) &
(~merged_df["P_TYPE"].str.startswith(TYPE)) &
(merged_df["P_SIZE"].isin(SIZE_LIST))
(merged_df["P_BRAND"] != brand) &
(~merged_df["P_TYPE"].str.startswith(p_type)) &
(merged_df["P_SIZE"].isin(size_list))
]

# Exclude unwanted suppliers
@@ -887,7 +881,7 @@ def q18(root: str, storage_options: Dict):
total = gb2.sort_values(["O_TOTALPRICE", "O_ORDERDATE"], ascending=[False, True])
total = total.head(100)

# [change 2]Convert cudf DataFrame to Pandas DataFrame and format timestamp
# [DIFF] Convert cudf DataFrame to Pandas DataFrame and format timestamp
total["O_ORDERDATE"] = pd.to_datetime(total["O_ORDERDATE"]).dt.strftime("%Y-%m-%d")

return total
@@ -959,42 +953,42 @@ def q19(root: str, storage_options: Dict):
jn = flineitem.merge(fpart, left_on="L_PARTKEY", right_on="P_PARTKEY")
jnsel = (
(
(jn.P_BRAND == brand1)
& (
(jn.P_CONTAINER == "SM BOX")
| (jn.P_CONTAINER == "SM CASE")
| (jn.P_CONTAINER == "SM PACK")
| (jn.P_CONTAINER == "SM PKG")
)
& (jn.L_QUANTITY >= quantity1)
& (jn.L_QUANTITY <= quantity1 + 10)
& (jn.P_SIZE <= 5)
)
|
(
(jn.P_BRAND == brand2)
& (
(jn.P_CONTAINER == "MED BAG")
| (jn.P_CONTAINER == "MED BOX")
| (jn.P_CONTAINER == "MED PACK")
| (jn.P_CONTAINER == "MED PKG")
)
& (jn.L_QUANTITY >= quantity2)
& (jn.L_QUANTITY <= quantity2 + 10)
& (jn.P_SIZE <= 10)
(jn.P_BRAND == brand1)
& (
(jn.P_CONTAINER == "SM BOX")
| (jn.P_CONTAINER == "SM CASE")
| (jn.P_CONTAINER == "SM PACK")
| (jn.P_CONTAINER == "SM PKG")
)
& (jn.L_QUANTITY >= quantity1)
& (jn.L_QUANTITY <= quantity1 + 10)
& (jn.P_SIZE <= 5)
)
|
(
(jn.P_BRAND == brand2)
& (
(jn.P_CONTAINER == "MED BAG")
| (jn.P_CONTAINER == "MED BOX")
| (jn.P_CONTAINER == "MED PACK")
| (jn.P_CONTAINER == "MED PKG")
)
& (jn.L_QUANTITY >= quantity2)
& (jn.L_QUANTITY <= quantity2 + 10)
& (jn.P_SIZE <= 10)
)
|
(
(jn.P_BRAND == brand3)
& (
(jn.P_CONTAINER == "LG BOX")
| (jn.P_CONTAINER == "LG CASE")
| (jn.P_CONTAINER == "LG PACK")
| (jn.P_CONTAINER == "LG PKG")
)
& (jn.L_QUANTITY >= quantity3)
& (jn.L_QUANTITY <= quantity3 + 10)
& (jn.P_SIZE <= 15)
(jn.P_BRAND == brand3)
& (
(jn.P_CONTAINER == "LG BOX")
| (jn.P_CONTAINER == "LG CASE")
| (jn.P_CONTAINER == "LG PACK")
| (jn.P_CONTAINER == "LG PKG")
)
& (jn.L_QUANTITY >= quantity3)
& (jn.L_QUANTITY <= quantity3 + 10)
& (jn.P_SIZE <= 15)
)
)
jn = jn[jnsel]
@@ -1104,7 +1098,7 @@ def q21(root: str, storage_options: Dict):
)
total = total.loc[:, ["S_NAME"]]
total = total.groupby("S_NAME", as_index=False).size()
# [change 4] add reset_index for the same error in q13
# [DIFF] groupby(...).size() returns a `Series` here; convert it to a `DataFrame` (same fix as q13)
total = total.reset_index(name='size')
total.columns = ["S_NAME", "NUMWAIT"]
total = total.sort_values(by=["NUMWAIT", "S_NAME"], ascending=[False, True])
@@ -1145,7 +1139,7 @@ def q22(root: str, storage_options: Dict):
)
customer_selected = customer_selected.loc[:, ["CNTRYCODE", "C_ACCTBAL"]]
agg1 = customer_selected.groupby(["CNTRYCODE"], as_index=False).size()
# [change 5] add reset_index for the same error in q13
# [DIFF] groupby(...).size() returns a `Series` here; convert it to a `DataFrame` (same fix as q13)
agg1 = agg1.reset_index(name='size')

agg1.columns = ["CNTRYCODE", "NUMCUST"]
