Skip to content

Commit

Permalink
updated the notebooks to use a new synthetic data for demonstration; …
Browse files Browse the repository at this point in the history
…addressed PR comments #208 and #230 (#231)
  • Loading branch information
komashk authored Aug 21, 2024
1 parent 56b34e6 commit 90e69f9
Show file tree
Hide file tree
Showing 6 changed files with 2,621 additions and 836 deletions.
212 changes: 177 additions & 35 deletions tutorials/analyzers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,116 @@
"source": [
"# Analyzers Basic Tutorial\n",
"\n",
"__Updated June 2024 to use a new dataset__\n",
"\n",
"This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Analyzers module."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"# Indicate your Spark version; here we use Spark 3.5 with PyDeequ 1.4.0\n",
"os.environ[\"SPARK_VERSION\"] = '3.5'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n",
"The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n",
"com.amazon.deequ#deequ added as a dependency\n",
":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n",
"\tconfs: [default]\n",
"\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n",
"\tfound org.scala-lang#scala-reflect;2.12.10 in central\n",
"\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n",
"\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n",
"\tfound com.github.fommil.netlib#core;1.1.2 in central\n",
"\tfound net.sf.opencsv#opencsv;2.3 in central\n",
"\tfound com.github.rwl#jtransforms;2.4.0 in central\n",
"\tfound junit#junit;4.8.2 in central\n",
"\tfound org.apache.commons#commons-math3;3.2 in central\n",
"\tfound org.spire-math#spire_2.12;0.13.0 in central\n",
"\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n",
"\tfound org.typelevel#machinist_2.12;0.6.1 in central\n",
"\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n",
"\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n",
"\tfound org.slf4j#slf4j-api;1.7.5 in central\n",
":: resolution report :: resolve 435ms :: artifacts dl 12ms\n",
"\t:: modules in use:\n",
"\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n",
"\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n",
"\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n",
"\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n",
"\tjunit#junit;4.8.2 from central in [default]\n",
"\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n",
"\torg.apache.commons#commons-math3;3.2 from central in [default]\n",
"\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n",
"\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n",
"\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n",
"\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n",
"\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n",
"\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n",
"\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n",
"\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n",
"\t:: evicted modules:\n",
"\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
"\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
"\t---------------------------------------------------------------------\n",
"\t| | modules || artifacts |\n",
"\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n",
"\t---------------------------------------------------------------------\n",
"\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n",
"\t---------------------------------------------------------------------\n",
":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n",
"\tconfs: [default]\n",
"\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession, Row, DataFrame\n",
"import json\n",
Expand All @@ -36,14 +138,30 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. "
"### We will be using the synthetic reviews dataset for Electronics products"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -53,32 +171,46 @@
" |-- customer_id: string (nullable = true)\n",
" |-- review_id: string (nullable = true)\n",
" |-- product_id: string (nullable = true)\n",
" |-- product_parent: string (nullable = true)\n",
" |-- product_title: string (nullable = true)\n",
" |-- star_rating: integer (nullable = true)\n",
" |-- helpful_votes: integer (nullable = true)\n",
" |-- total_votes: integer (nullable = true)\n",
" |-- vine: string (nullable = true)\n",
" |-- verified_purchase: string (nullable = true)\n",
" |-- star_rating: long (nullable = true)\n",
" |-- helpful_votes: long (nullable = true)\n",
" |-- total_votes: long (nullable = true)\n",
" |-- insight: string (nullable = true)\n",
" |-- review_headline: string (nullable = true)\n",
" |-- review_body: string (nullable = true)\n",
" |-- review_date: date (nullable = true)\n",
" |-- year: integer (nullable = true)\n",
" |-- review_date: timestamp (nullable = true)\n",
" |-- review_year: long (nullable = true)\n",
"\n"
]
}
],
"source": [
"df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n",
"df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n",
"\n",
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -87,15 +219,23 @@
"| entity| instance| name| value|\n",
"+-----------+--------------------+-------------------+--------------------+\n",
"| Column| review_id| Completeness| 1.0|\n",
"| Column| review_id|ApproxCountDistinct| 3010972.0|\n",
"|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n",
"| Dataset| *| Size| 3120938.0|\n",
"| Column| star_rating| Mean| 4.036143941340712|\n",
"| Column| top star_rating| Compliance| 0.7494070692849394|\n",
"|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n",
"| Column| review_id|ApproxCountDistinct| 3160409.0|\n",
"|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n",
"| Dataset| *| Size| 3010972.0|\n",
"| Column| star_rating| Mean| 3.9999973430506826|\n",
"| Column| top star_rating| Compliance| 0.7499993357626706|\n",
"|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n",
"+-----------+--------------------+-------------------+--------------------+\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n",
" warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n"
]
}
],
"source": [
Expand All @@ -119,7 +259,9 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -161,42 +303,42 @@
" <td>Column</td>\n",
" <td>review_id</td>\n",
" <td>ApproxCountDistinct</td>\n",
" <td>3.010972e+06</td>\n",
" <td>3.160409e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mutlicolumn</td>\n",
" <td>total_votes,star_rating</td>\n",
" <td>Correlation</td>\n",
" <td>-3.451098e-02</td>\n",
" <td>-7.388090e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Dataset</td>\n",
" <td>*</td>\n",
" <td>Size</td>\n",
" <td>3.120938e+06</td>\n",
" <td>3.010972e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Column</td>\n",
" <td>star_rating</td>\n",
" <td>Mean</td>\n",
" <td>4.036144e+00</td>\n",
" <td>3.999997e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Column</td>\n",
" <td>top star_rating</td>\n",
" <td>Compliance</td>\n",
" <td>7.494071e-01</td>\n",
" <td>7.499993e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Mutlicolumn</td>\n",
" <td>total_votes,helpful_votes</td>\n",
" <td>Correlation</td>\n",
" <td>9.936464e-01</td>\n",
" <td>9.817923e-01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -205,12 +347,12 @@
"text/plain": [
" entity instance name value\n",
"0 Column review_id Completeness 1.000000e+00\n",
"1 Column review_id ApproxCountDistinct 3.010972e+06\n",
"2 Mutlicolumn total_votes,star_rating Correlation -3.451098e-02\n",
"3 Dataset * Size 3.120938e+06\n",
"4 Column star_rating Mean 4.036144e+00\n",
"5 Column top star_rating Compliance 7.494071e-01\n",
"6 Mutlicolumn total_votes,helpful_votes Correlation 9.936464e-01"
"1 Column review_id ApproxCountDistinct 3.160409e+06\n",
"2 Mutlicolumn total_votes,star_rating Correlation -7.388090e-04\n",
"3 Dataset * Size 3.010972e+06\n",
"4 Column star_rating Mean 3.999997e+00\n",
"5 Column top star_rating Compliance 7.499993e-01\n",
"6 Mutlicolumn total_votes,helpful_votes Correlation 9.817923e-01"
]
},
"execution_count": 5,
Expand Down Expand Up @@ -247,7 +389,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.10.14"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 90e69f9

Please sign in to comment.