From 87b7297357a1703d5be3b9eb01e6ed80adee5665 Mon Sep 17 00:00:00 2001
From: "Aldo \"xoen\" Giambelluca"
Date: Tue, 15 Dec 2020 11:37:11 +0000
Subject: [PATCH 1/4] allspark: Updated Spark dependencies (PYSPARK_SUBMIT_ARGS)

Also removed the duplication of `PYSPARK_SUBMIT_ARGS`, which was set
twice: both in the `Dockerfile` and in the `pyspark-s3.py` script.

Why?
----
Robin L.'s testing found an issue related to S3:

```
Py4JJavaError: An error occurred while calling o103.parquet.
: java.lang.NumberFormatException: For input string: "100M"
[...]
```

The [SO Question] leads to a [`hadoop-common` issue] which seems to be
resolved in a more recent version.

I've updated the versions in the `PYSPARK_SUBMIT_ARGS` environment
variable, hoping these newer versions work without problems, but we may
need to tweak this further to make things work.

[SO Question]: https://stackoverflow.com/questions/60172792/reading-data-from-s3-using-pyspark-throws-java-lang-numberformatexception-for-i
[`hadoop-common` issue]: https://issues.apache.org/jira/browse/HADOOP-13680
---
 allspark-notebook/Dockerfile          | 2 +-
 allspark-notebook/files/pyspark-s3.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/allspark-notebook/Dockerfile b/allspark-notebook/Dockerfile
index c040092..a378a56 100644
--- a/allspark-notebook/Dockerfile
+++ b/allspark-notebook/Dockerfile
@@ -7,7 +7,7 @@ USER root
 ENV PATH=$PATH:$HOME/.local/bin
 ENV NB_UID=1001
 ENV CHOWN_HOME=no
-ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 pyspark-shell"
+ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.3.0 pyspark-shell"

 COPY ./files/* /tmp/

diff --git a/allspark-notebook/files/pyspark-s3.py b/allspark-notebook/files/pyspark-s3.py
index da6f313..6cf08c9 100644
--- a/allspark-notebook/files/pyspark-s3.py
+++ b/allspark-notebook/files/pyspark-s3.py
@@ -1,8 +1,5 @@
 #!/usr/bin/env python

-import os
-os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 pyspark-shell'
-
 import pyspark

 sc = pyspark.SparkContext("local[*]")

From f48fce0f00d38b15f0faffe4027ff988207b3e25 Mon Sep 17 00:00:00 2001
From: "Aldo \"xoen\" Giambelluca"
Date: Tue, 15 Dec 2020 14:29:39 +0000
Subject: [PATCH 2/4] allspark: match hadoop-aws/pyspark version

Robin L. is getting yet more strange errors, which may be caused by
this mismatch.

Also made the `Dockerfile` more explicit about the version of `pyspark`
and added a comment so that this is documented.

NOTE: Leaving the version of `com.amazonaws:aws-java-sdk` as it is for
the moment, but it's not 100% clear whether this needs to be specified
or whether `hadoop-aws` pulls in the correct version automatically.
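For reference, a quick way to double-check which Hadoop version a given
`pyspark` install is built against (and therefore which
`org.apache.hadoop:hadoop-aws` version to pin) is to ask the JVM
directly. This is only a sketch of the check, not part of the image
build:

```python
#!/usr/bin/env python
# Sketch: print the pyspark version and the Hadoop version it bundles,
# so the hadoop-aws coordinate can be pinned to match.
import pyspark

sc = pyspark.SparkContext("local[*]")
print("pyspark:", pyspark.__version__)
# VersionInfo is part of hadoop-common, which ships with the Spark jars
print("hadoop:", sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion())
sc.stop()
```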
---
 allspark-notebook/Dockerfile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/allspark-notebook/Dockerfile b/allspark-notebook/Dockerfile
index a378a56..413a6de 100644
--- a/allspark-notebook/Dockerfile
+++ b/allspark-notebook/Dockerfile
@@ -7,11 +7,13 @@ USER root
 ENV PATH=$PATH:$HOME/.local/bin
 ENV NB_UID=1001
 ENV CHOWN_HOME=no
-ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.3.0 pyspark-shell"
+# `org.apache.hadoop:hadoop-aws` version must match `pyspark`
+# version
+ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"

 COPY ./files/* /tmp/

-RUN pip install --upgrade pip boto3 pyspark nbstripout \
+RUN pip install --upgrade pip boto3 pyspark==3.0.1 nbstripout \
    && python /tmp/pyspark-s3.py \
    && pip install etl-manager==7.3.0 \
    && pip install gluejobutils==3.1.1 \

From 65cf5c7dc31e53b1ba2cdd3709d32066ad5dd9f5 Mon Sep 17 00:00:00 2001
From: "Aldo \"xoen\" Giambelluca"
Date: Wed, 16 Dec 2020 10:51:11 +0000
Subject: [PATCH 3/4] allspark: removed deprecated SQLContext and updated example

The `pyspark-s3.py` script was importing `pyspark.sql.SQLContext`
(now deprecated) and declaring an unused variable. Removed both to
avoid confusion.

Also updated the `spark_read_s3.py` test code snippet with something
more complete that almost works when copied and pasted (e.g. in a
running JupyterLab instance).
---
 allspark-notebook/files/pyspark-s3.py          | 3 ---
 allspark-notebook/tests/files/spark_read_s3.py | 7 ++++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/allspark-notebook/files/pyspark-s3.py b/allspark-notebook/files/pyspark-s3.py
index 6cf08c9..74c7f9a 100644
--- a/allspark-notebook/files/pyspark-s3.py
+++ b/allspark-notebook/files/pyspark-s3.py
@@ -3,8 +3,5 @@
 import pyspark

 sc = pyspark.SparkContext("local[*]")
-from pyspark.sql import SQLContext
-sqlContext = SQLContext(sc)
-
 hadoopConf = sc._jsc.hadoopConfiguration()
 hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

diff --git a/allspark-notebook/tests/files/spark_read_s3.py b/allspark-notebook/tests/files/spark_read_s3.py
index 25e026a..f2171ee 100644
--- a/allspark-notebook/tests/files/spark_read_s3.py
+++ b/allspark-notebook/tests/files/spark_read_s3.py
@@ -1,3 +1,8 @@
+from pyspark.context import SparkContext
+from pyspark.sql import SparkSession
+sc = SparkContext.getOrCreate()
+spark = SparkSession(sc)

-spark.read_parquet()
+df = spark.read.csv("s3a://bucket/path/to/file.csv")
+df.limit(10).show()

From e6a2c903cb079cf0dc6f1fb35a1b48db5819e3bf Mon Sep 17 00:00:00 2001
From: "Aldo \"xoen\" Giambelluca"
Date: Wed, 16 Dec 2020 12:23:34 +0000
Subject: [PATCH 4/4] allspark: added Spark inspec test

This will run the Pi Spark example job to check that everything is
configured correctly. It also checks that `$SPARK_HOME` is set
correctly.

NOTE: The Pi Spark example job returns a very approximate value for Pi.
During one run, for example, it returned 3.13###, so the test checks
the output against `3.1` to avoid/reduce spurious inspec test failures.
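For context, the bundled `pi.py` example estimates Pi with a Monte
Carlo simulation, so its output varies from run to run. A rough sketch
of what it does (not the exact upstream script) looks like this:

```python
# Rough sketch of Spark's pi.py example: Monte Carlo estimation of Pi.
# The estimate fluctuates run to run, hence matching only "3.1" in the test.
from random import random

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PiSketch").getOrCreate()
n = 100000 * 2  # total number of random samples (2 partitions)

def inside(_):
    # Pick a random point in the 2x2 square and count it if it
    # falls inside the unit circle.
    x, y = random() * 2 - 1, random() * 2 - 1
    return 1 if x * x + y * y <= 1 else 0

count = spark.sparkContext.parallelize(range(1, n + 1), 2) \
    .map(inside).reduce(lambda a, b: a + b)
# Fraction of points inside the circle approximates Pi/4
print("Pi is roughly %f" % (4.0 * count / n))
spark.stop()
```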
---
 .../tests/controls/pyspark_spec.rb | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 allspark-notebook/tests/controls/pyspark_spec.rb

diff --git a/allspark-notebook/tests/controls/pyspark_spec.rb b/allspark-notebook/tests/controls/pyspark_spec.rb
new file mode 100644
index 0000000..a66f919
--- /dev/null
+++ b/allspark-notebook/tests/controls/pyspark_spec.rb
@@ -0,0 +1,22 @@
+title 'Working pyspark'
+
+control "pyspark is available" do
+  impact "high"
+  title "pyspark should be installed and work"
+  desc "pyspark is installed and can run pyspark jobs"
+  tag "pyspark"
+  tag "spark"
+
+  describe command("echo $SPARK_HOME") do
+    its("stdout.strip") { should eq "/usr/local/spark" }
+  end
+
+  # Can run one of the Spark examples
+  describe command("python3 /usr/local/spark/examples/src/main/python/pi.py") do
+    its("exit_status") { should eq 0 }
+    # Pi calculated by the Spark example job is very
+    # approximate (one run returned 3.13###), so we
+    # check against 3.1 to avoid random failures
+    its("stdout") { should match /Pi is roughly 3.1/ }
+  end
+end