Skip to content

Commit

Permalink
updated the notebooks to use a new synthetic data for demonstration; …
Browse files Browse the repository at this point in the history
…addressed PR comments #208 and #230 (#231)
  • Loading branch information
komashk authored Aug 21, 2024
1 parent 56b34e6 commit 90e69f9
Show file tree
Hide file tree
Showing 6 changed files with 2,621 additions and 836 deletions.
212 changes: 177 additions & 35 deletions tutorials/analyzers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,116 @@
"source": [
"# Analyzers Basic Tutorial\n",
"\n",
"__Updated June 2024 to use a new dataset__\n",
"\n",
"This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Analyzers module."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"# Indicate your Spark version; here we use Spark 3.5 with PyDeequ 1.4.0\n",
"os.environ[\"SPARK_VERSION\"] = '3.5'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n",
"The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n",
"com.amazon.deequ#deequ added as a dependency\n",
":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n",
"\tconfs: [default]\n",
"\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n",
"\tfound org.scala-lang#scala-reflect;2.12.10 in central\n",
"\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n",
"\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n",
"\tfound com.github.fommil.netlib#core;1.1.2 in central\n",
"\tfound net.sf.opencsv#opencsv;2.3 in central\n",
"\tfound com.github.rwl#jtransforms;2.4.0 in central\n",
"\tfound junit#junit;4.8.2 in central\n",
"\tfound org.apache.commons#commons-math3;3.2 in central\n",
"\tfound org.spire-math#spire_2.12;0.13.0 in central\n",
"\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n",
"\tfound org.typelevel#machinist_2.12;0.6.1 in central\n",
"\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n",
"\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n",
"\tfound org.slf4j#slf4j-api;1.7.5 in central\n",
":: resolution report :: resolve 435ms :: artifacts dl 12ms\n",
"\t:: modules in use:\n",
"\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n",
"\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n",
"\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n",
"\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n",
"\tjunit#junit;4.8.2 from central in [default]\n",
"\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n",
"\torg.apache.commons#commons-math3;3.2 from central in [default]\n",
"\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n",
"\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n",
"\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n",
"\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n",
"\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n",
"\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n",
"\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n",
"\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n",
"\t:: evicted modules:\n",
"\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
"\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
"\t---------------------------------------------------------------------\n",
"\t| | modules || artifacts |\n",
"\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n",
"\t---------------------------------------------------------------------\n",
"\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n",
"\t---------------------------------------------------------------------\n",
":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n",
"\tconfs: [default]\n",
"\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession, Row, DataFrame\n",
"import json\n",
Expand All @@ -36,14 +138,30 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. "
"### We will be using the synthetic reviews dataset for Electronics products"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -53,32 +171,46 @@
" |-- customer_id: string (nullable = true)\n",
" |-- review_id: string (nullable = true)\n",
" |-- product_id: string (nullable = true)\n",
" |-- product_parent: string (nullable = true)\n",
" |-- product_title: string (nullable = true)\n",
" |-- star_rating: integer (nullable = true)\n",
" |-- helpful_votes: integer (nullable = true)\n",
" |-- total_votes: integer (nullable = true)\n",
" |-- vine: string (nullable = true)\n",
" |-- verified_purchase: string (nullable = true)\n",
" |-- star_rating: long (nullable = true)\n",
" |-- helpful_votes: long (nullable = true)\n",
" |-- total_votes: long (nullable = true)\n",
" |-- insight: string (nullable = true)\n",
" |-- review_headline: string (nullable = true)\n",
" |-- review_body: string (nullable = true)\n",
" |-- review_date: date (nullable = true)\n",
" |-- year: integer (nullable = true)\n",
" |-- review_date: timestamp (nullable = true)\n",
" |-- review_year: long (nullable = true)\n",
"\n"
]
}
],
"source": [
"df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n",
"df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n",
"\n",
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -87,15 +219,23 @@
"| entity| instance| name| value|\n",
"+-----------+--------------------+-------------------+--------------------+\n",
"| Column| review_id| Completeness| 1.0|\n",
"| Column| review_id|ApproxCountDistinct| 3010972.0|\n",
"|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n",
"| Dataset| *| Size| 3120938.0|\n",
"| Column| star_rating| Mean| 4.036143941340712|\n",
"| Column| top star_rating| Compliance| 0.7494070692849394|\n",
"|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n",
"| Column| review_id|ApproxCountDistinct| 3160409.0|\n",
"|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n",
"| Dataset| *| Size| 3010972.0|\n",
"| Column| star_rating| Mean| 3.9999973430506826|\n",
"| Column| top star_rating| Compliance| 0.7499993357626706|\n",
"|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n",
"+-----------+--------------------+-------------------+--------------------+\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n",
" warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n"
]
}
],
"source": [
Expand All @@ -119,7 +259,9 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -161,42 +303,42 @@
" <td>Column</td>\n",
" <td>review_id</td>\n",
" <td>ApproxCountDistinct</td>\n",
" <td>3.010972e+06</td>\n",
" <td>3.160409e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mutlicolumn</td>\n",
" <td>total_votes,star_rating</td>\n",
" <td>Correlation</td>\n",
" <td>-3.451098e-02</td>\n",
" <td>-7.388090e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Dataset</td>\n",
" <td>*</td>\n",
" <td>Size</td>\n",
" <td>3.120938e+06</td>\n",
" <td>3.010972e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Column</td>\n",
" <td>star_rating</td>\n",
" <td>Mean</td>\n",
" <td>4.036144e+00</td>\n",
" <td>3.999997e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Column</td>\n",
" <td>top star_rating</td>\n",
" <td>Compliance</td>\n",
" <td>7.494071e-01</td>\n",
" <td>7.499993e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Mutlicolumn</td>\n",
" <td>total_votes,helpful_votes</td>\n",
" <td>Correlation</td>\n",
" <td>9.936464e-01</td>\n",
" <td>9.817923e-01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -205,12 +347,12 @@
"text/plain": [
" entity instance name value\n",
"0 Column review_id Completeness 1.000000e+00\n",
"1 Column review_id ApproxCountDistinct 3.010972e+06\n",
"2 Mutlicolumn total_votes,star_rating Correlation -3.451098e-02\n",
"3 Dataset * Size 3.120938e+06\n",
"4 Column star_rating Mean 4.036144e+00\n",
"5 Column top star_rating Compliance 7.494071e-01\n",
"6 Mutlicolumn total_votes,helpful_votes Correlation 9.936464e-01"
"1 Column review_id ApproxCountDistinct 3.160409e+06\n",
"2 Mutlicolumn total_votes,star_rating Correlation -7.388090e-04\n",
"3 Dataset * Size 3.010972e+06\n",
"4 Column star_rating Mean 3.999997e+00\n",
"5 Column top star_rating Compliance 7.499993e-01\n",
"6 Mutlicolumn total_votes,helpful_votes Correlation 9.817923e-01"
]
},
"execution_count": 5,
Expand Down Expand Up @@ -247,7 +389,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.10.14"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 90e69f9

Please sign in to comment.