From 94df352e5a54bcde6a245e186165a2a70467eb93 Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Wed, 19 Jul 2023 13:30:54 +0300
Subject: [PATCH 01/10] added unit test for generic_expectations_without_null

---
 functions/data_test/tests/test_profiling.py | 86 ++++++++++++++++++++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index 231dabc..9e8055d 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -1,9 +1,56 @@
 import pytest
 from profiling import (add_local_s3_to_stores,
-                       read_gx_config_file)
+                       read_gx_config_file,
+                       generic_expectations_without_null)
+import great_expectations as gx
+import pandas as pd
 
 ENDPOINT_URL = "http://localhost:4566"
-
+summary_template = {
+    "n_distinct": 418,
+    "p_distinct": 1.0,
+    "is_unique": True,
+    "n_unique": 418,
+    "p_unique": 1.0,
+    "type": "Numeric",
+    "hashable": True,
+    "value_counts_without_nan": "892",
+    "value_counts_index_sorted": "892 1 \nName: PassengerId, Length: 418, dtype: int64",
+    "ordering": True,
+    "n_missing": 0,
+    "n": 418, "p_missing": 0.0,
+    "count": 418,
+    "memory_size": 3472,
+    "n_negative": "0",
+    "p_negative": 0.0,
+    "n_infinite": "0",
+    "n_zeros": 0,
+    "mean": 1100.5,
+    "std": 120.81045760473994,
+    "variance": 14595.166666666666,
+    "min": "892",
+    "max": "1309",
+    "kurtosis": -1.2,
+    "skewness": 0.0,
+    "sum": "460009",
+    "mad": 104.5,
+    "range": "417",
+    "5%": 912.85,
+    "25%": 996.25,
+    "50%": 1100.5,
+    "75%": 1204.75,
+    "95%": 1288.15,
+    "iqr": 208.5,
+    "cv": 0.1097777897362471,
+    "p_zeros": 0.0,
+    "p_infinite": 0.0,
+    "monotonic_increase": True,
+    "monotonic_decrease": False,
+    "monotonic_increase_strict": True,
+    "monotonic_decrease_strict": False,
+    "monotonic": 2,
+    "histogram": ["[9]"]
+}
 
 @pytest.mark.parametrize("stores, expected_output", [
     ({"store1": {"store_backend": {"type": "s3", "bucket": "my-bucket"}}},
      {"store1": {"store_backend": {"type": "s3", "bucket": "my-bucket",
                                    "boto3_options":
                                        {"endpoint_url": ENDPOINT_URL}}}}),
     ({}, {})
 ])
 def test_add_local_s3_to_stores(stores, expected_output):
@@ -26,3 +73,38 @@ def test_gx_config_file_path_is_not_none(tmpdir):
     p.write("config_version: 10.0")
     config_file = read_gx_config_file(path=p)
     assert config_file["config_version"] == 10.0
+
+@pytest.fixture(autouse=True)
+def before_and_after_test():
+    df = pd.DataFrame(columns=['PassengerId'])
+    context_gx = gx.get_context()
+    datasource = context_gx.sources.add_pandas(name="test")
+    data_asset = datasource.add_dataframe_asset(name="test")
+    batch_request = data_asset.build_batch_request(dataframe=df)
+    context_gx.add_or_update_expectation_suite("test_suite")
+    batch_empty = context_gx.get_validator(
+        batch_request=batch_request,
+        expectation_suite_name="test_suite",
+    )
+
+    yield batch_empty
+
+    context_gx.delete_expectation_suite("test_suite")
+    context_gx.delete_datasource("test")
+
+
+@pytest.mark.parametrize("uniq_percent, applied", [(0.95, True), (0.9, True), (0.1, False)])
+def test_generic_expectations_without_null(uniq_percent, applied, before_and_after_test):
+    name_expected = "PassengerId"
+    uniq_percent = eval("uniq_percent")
+    summary_expected = summary_template
+    applied = eval("applied")
+    summary_expected['p_unique'] = uniq_percent
+    expectation_type = "expect_column_values_to_be_unique"
+    batch_empty = before_and_after_test
+
+    name, summary, batch = generic_expectations_without_null(name_expected, summary_expected, batch_empty)
+
+    assert name == name_expected
+    assert "expect_column_to_exist" in str(batch.expectation_suite)
+    assert (expectation_type in str(batch.expectation_suite)) == applied

From da40cbd054dc4bec03651a46697e1509f972f424 Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Thu, 20 Jul 2023 14:53:03 +0300
Subject: [PATCH 02/10] added test for expectations_mean and expectations_null

---
 functions/data_test/data_test/profiling.py  |  4 +-
 functions/data_test/tests/test_profiling.py | 68 +++++++++++++++++----
 2 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/functions/data_test/data_test/profiling.py b/functions/data_test/data_test/profiling.py
index 00622dc..655deb3 100755
--- a/functions/data_test/data_test/profiling.py
+++ b/functions/data_test/data_test/profiling.py
@@ -28,7 +28,7 @@
 qa_bucket_name = os.environ['BUCKET']
 
 
-def generic_expectations_without_null(name, summary, batch, *args):
+def expectations_unique(name, summary, batch, *args):
     batch.expect_column_to_exist(column=name)
     if summary["p_unique"] >= 0.9:
         batch.expect_column_values_to_be_unique(column=name)
@@ -100,7 +100,7 @@ def __init__(self, typeset, *args, **kwargs):
             expectations_null,
         ],
         "Numeric": [
-            generic_expectations_without_null,
+            expectations_unique,
             expectations_null,
             expectation_algorithms.numeric_expectations,
             expectations_mean,
diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index 9e8055d..907e207 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -1,7 +1,10 @@
 import pytest
 from profiling import (add_local_s3_to_stores,
-                       read_gx_config_file,
-                       generic_expectations_without_null)
+                       read_gx_config_file,
+                       expectations_unique,
+                       expectations_null,
+                       expectations_mean,
+                       calculate_mean)
 import great_expectations as gx
 import pandas as pd
@@ -18,7 +21,8 @@
     "value_counts_index_sorted": "892 1 \nName: PassengerId, Length: 418, dtype: int64",
     "ordering": True,
     "n_missing": 0,
-    "n": 418, "p_missing": 0.0,
+    "n": 418,
+    "p_missing": 0.0,
     "count": 418,
     "memory_size": 3472,
@@ -52,11 +56,12 @@
     "histogram": ["[9]"]
 }
 
+
 @pytest.mark.parametrize("stores, expected_output", [
     ({"store1": {"store_backend": {"type": "s3", "bucket": "my-bucket"}}},
      {"store1": {"store_backend": {"type": "s3", "bucket": "my-bucket",
                                    "boto3_options":
-                                       {"endpoint_url": ENDPOINT_URL}}}}),
+                                   {"endpoint_url": ENDPOINT_URL}}}}),
     ({}, {})
 ])
 def test_add_local_s3_to_stores(stores, expected_output):
@@ -74,6 +79,15 @@ def test_gx_config_file_path_is_not_none(tmpdir):
     config_file = read_gx_config_file(path=p)
     assert config_file["config_version"] == 10.0
 
+
+def change_template(params, params_name):
+    name_expected = "PassengerId"
+    summary_expected = summary_template
+    for param, name in zip(params, params_name):
+        summary_expected[name] = param
+    return name_expected, summary_expected
+
+
 @pytest.fixture(autouse=True)
 def before_and_after_test():
@@ -93,18 +107,50 @@ def before_and_after_test():
     context_gx.delete_datasource("test")
 
 
-@pytest.mark.parametrize("uniq_percent, applied", [(0.95, True), (0.9, True), (0.1, False)])
-def test_generic_expectations_without_null(uniq_percent, applied, before_and_after_test):
-    name_expected = "PassengerId"
-    uniq_percent = eval("uniq_percent")
-    summary_expected = summary_template
+@pytest.mark.parametrize("p_unique, applied", [(0.95, True), (0.9, True), (0.1, False)])
+def test_expectations_unique(p_unique, applied, before_and_after_test):
+    p_unique = eval("p_unique")
     applied = eval("applied")
-    summary_expected['p_unique'] = uniq_percent
+    name_expected, summary_expected = change_template([p_unique], ["p_unique"])
["p_unique"]) expectation_type = "expect_column_values_to_be_unique" batch_empty = before_and_after_test - name, summary, batch = generic_expectations_without_null(name_expected, summary_expected, batch_empty) + name, summary, batch = expectations_unique(name_expected, summary_expected, batch_empty) assert name == name_expected assert "expect_column_to_exist" in str(batch.expectation_suite) assert (expectation_type in str(batch.expectation_suite)) == applied + + +@pytest.mark.parametrize("p_missing, applied", [(0.4, True), (0.2, True), (0.5, False)]) +def test_expectations_null(p_missing, applied, before_and_after_test): + p_missing = eval("p_missing") + applied = eval("applied") + name_expected, summary_expected = change_template([p_missing], ["p_missing"]) + expectation_type = "expect_column_values_to_not_be_null" + batch_empty = before_and_after_test + + name, summary, batch = expectations_null(name_expected, summary_expected, batch_empty) + + assert name == name_expected + assert (expectation_type in str(batch.expectation_suite)) == applied + + +@pytest.mark.parametrize("n,std,mean,max_mean,min_mean", + [(418, 120.81045760473994, 1100.5, 1106.349942307408, 1094.650057692592)]) +def test_expectations_mean(n, std, mean, max_mean, min_mean, before_and_after_test): + n = eval("n") + std = eval("std") + mean = eval("mean") + max_mean_expected = eval("max_mean") + min_mean_expected = eval("min_mean") + name_expected, summary_expected = change_template([n, std, mean], ["n", "std", "mean"]) + expectation_type = "expect_column_mean_to_be_between" + batch_empty = before_and_after_test + + min_mean, max_mean = calculate_mean(summary_expected) + name, summary, batch = expectations_mean(name_expected, summary_expected, batch_empty) + + assert (min_mean == min_mean_expected and max_mean == max_mean_expected) + assert name == name_expected + assert expectation_type in str(batch.expectation_suite) From 4b4c42a381f53293570cadb0ee011819be47bf32 Mon Sep 17 00:00:00 2001 From: bvolodarskiy Date: Mon, 31 Jul 2023 15:58:15 +0300 Subject: [PATCH 03/10] added tests for expectations_z_score,expectations_stdev --- functions/data_test/data_test/profiling.py | 4 +- functions/data_test/tests/test_profiling.py | 45 ++++++++++++++++++++- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/functions/data_test/data_test/profiling.py b/functions/data_test/data_test/profiling.py index 655deb3..4102dbe 100755 --- a/functions/data_test/data_test/profiling.py +++ b/functions/data_test/data_test/profiling.py @@ -1,6 +1,6 @@ import json import math - +import numpy as np from ydata_profiling import ProfileReport import os import boto3 @@ -244,7 +244,7 @@ def calculate_z_score(summary): maximum = summary["max"] significance_level = 0.005 threshold = (maximum - mean) / std - if std: + if std and not np.isnan(std): return threshold + significance_level diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py index 907e207..20e6c27 100644 --- a/functions/data_test/tests/test_profiling.py +++ b/functions/data_test/tests/test_profiling.py @@ -1,10 +1,15 @@ import pytest +import numpy as np from profiling import (add_local_s3_to_stores, read_gx_config_file, expectations_unique, expectations_null, expectations_mean, - calculate_mean) + calculate_mean, + calculate_stdev, + expectations_stdev, + calculate_z_score, + expectations_z_score) import great_expectations as gx import pandas as pd @@ -154,3 +159,41 @@ def test_expectations_mean(n, std, mean, max_mean, min_mean, 
before_and_after_te assert (min_mean == min_mean_expected and max_mean == max_mean_expected) assert name == name_expected assert expectation_type in str(batch.expectation_suite) + +@pytest.mark.parametrize("n,std,max_std,min_std", + [(418, 120.81045760473994, 136.10108739120102, 105.51982781827887)]) +def test_expectations_stdev(n, std, max_std, min_std, before_and_after_test): + n = eval("n") + std = eval("std") + max_std_expected = eval("max_std") + min_std_expected = eval("min_std") + name_expected, summary_expected = change_template([n, std], ["n", "std"]) + expectation_type = "expect_column_stdev_to_be_between" + batch_empty = before_and_after_test + + min_std, max_std = calculate_stdev(summary_expected) + name, summary, batch = expectations_stdev(name_expected, summary_expected, batch_empty) + + assert (min_std == min_std_expected and max_std == max_std_expected) + assert name == name_expected + assert expectation_type in str(batch.expectation_suite) + +@pytest.mark.parametrize("mean,std,max,threshold,applied", + [(418, 120.81045760473994, 1309, 7.380189347557294, True), + (418, np.nan, 1309, None, False)]) +def test_expectations_z_score(mean, std, max, threshold, applied, before_and_after_test): + mean = eval("mean") + std = eval("std") + max = eval("max") + threshold_expected = eval("threshold") + applied = eval("applied") + name_expected, summary_expected = change_template([mean, std, max], ["mean", "std", "max"]) + expectation_type = "expect_column_value_z_scores_to_be_less_than" + batch_empty = before_and_after_test + + threshold = calculate_z_score(summary_expected) + name, summary, batch = expectations_z_score(name_expected, summary_expected, batch_empty) + + assert threshold == threshold_expected + assert name == name_expected + assert (expectation_type in str(batch.expectation_suite)) == applied From ac7bd9017fcd2611a0700d4a586db90b10126756 Mon Sep 17 00:00:00 2001 From: bvolodarskiy Date: Tue, 1 Aug 2023 14:19:19 +0300 Subject: [PATCH 04/10] added tests for expectations_median,expectations_quantile --- functions/data_test/data_test/profiling.py | 27 ++++++----- functions/data_test/tests/test_profiling.py | 53 ++++++++++++++++++++- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/functions/data_test/data_test/profiling.py b/functions/data_test/data_test/profiling.py index 4102dbe..0fb59b6 100755 --- a/functions/data_test/data_test/profiling.py +++ b/functions/data_test/data_test/profiling.py @@ -50,9 +50,8 @@ def expectations_mean(name, summary, batch, *args): def expectations_median(name, summary, batch, *args): min_median, max_median = calculate_median(summary) - if min_median and max_median: - batch.expect_column_median_to_be_between( - column=name, min_value=min_median, max_value=max_median) + batch.expect_column_median_to_be_between( + column=name, min_value=min_median, max_value=max_median) return name, summary, batch @@ -64,11 +63,10 @@ def expectations_stdev(name, summary, batch, *args): def expectations_quantile(name, summary, batch, *args): + value_ranges = calculate_q_ranges(summary) q_ranges = { "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95], - "value_ranges": [[summary["5%"], summary["25%"]], [summary["25%"], summary["50%"]], - [summary["50%"], summary["75%"]], [summary["75%"], summary["95%"]], - [summary["95%"], summary["max"]]] + "value_ranges": value_ranges } batch.expect_column_quantile_values_to_be_between( column=name, quantile_ranges=q_ranges) @@ -209,8 +207,6 @@ def calculate_mean(summary): def calculate_median(summary): - min_median = None - 
-    max_median = None
     raw_values = summary["value_counts_index_sorted"]
     values = []
     for key, v in raw_values.items():
@@ -218,10 +214,11 @@ def calculate_median(summary):
             values.extend(key)
     q = 0.5
     j = int(len(values) * q - 2.58 * math.sqrt(len(values) * q * (1 - q)))
-    k = int(len(values) * q + 2.58 * math.sqrt(len(values) * q * (1 - q)))
+    k = int(len(values) * q + 2.58 * math.sqrt(len(values) * q * (1 - q))) - 1
+    if j >= 1:
+        j -= 1
+    min_median = values[j]
+    max_median = values[k]
     return min_median, max_median
@@ -248,6 +245,12 @@ def calculate_z_score(summary):
     return threshold + significance_level
 
 
+def calculate_q_ranges(summary):
+    return [[summary["5%"], summary["25%"]], [summary["25%"], summary["50%"]],
+            [summary["50%"], summary["75%"]], [summary["75%"], summary["95%"]],
+            [summary["95%"], summary["max"]]]
+
+
 def profile_data(df, suite_name, cloudfront, datasource_root, source_covered,
                  mapping_config, run_name):
     qa_bucket = s3.Bucket(qa_bucket_name)
diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index 20e6c27..182d939 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -9,7 +9,11 @@
                        calculate_stdev,
                        expectations_stdev,
                        calculate_z_score,
-                       expectations_z_score)
+                       expectations_z_score,
+                       expectations_quantile,
+                       calculate_q_ranges,
+                       calculate_median,
+                       expectations_median)
 import great_expectations as gx
 import pandas as pd
@@ -23,7 +27,7 @@
     "type": "Numeric",
     "hashable": True,
     "value_counts_without_nan": "892",
-    "value_counts_index_sorted": "892 1 \nName: PassengerId, Length: 418, dtype: int64",
+    "value_counts_index_sorted": pd.Series({892: 1, 893: 1, 894: 1, 1004: 2, 1500: 1}),
     "ordering": True,
     "n_missing": 0,
     "n": 418,
@@ -160,6 +164,7 @@ def test_expectations_mean(n, std, mean, max_mean, min_mean, before_and_after_te
     assert name == name_expected
     assert expectation_type in str(batch.expectation_suite)
 
+
 @pytest.mark.parametrize("n,std,max_std,min_std",
                          [(418, 120.81045760473994, 136.10108739120102, 105.51982781827887)])
 def test_expectations_stdev(n, std, max_std, min_std, before_and_after_test):
@@ -178,6 +183,7 @@ def test_expectations_stdev(n, std, max_std, min_std, before_and_after_test):
     assert name == name_expected
     assert expectation_type in str(batch.expectation_suite)
 
+
 @pytest.mark.parametrize("mean,std,max,threshold,applied",
                          [(418, 120.81045760473994, 1309, 7.380189347557294, True),
                           (418, np.nan, 1309, None, False)])
@@ -197,3 +203,46 @@ def test_expectations_z_score(mean, std, max, threshold, applied, before_and_aft
     assert threshold == threshold_expected
     assert name == name_expected
     assert (expectation_type in str(batch.expectation_suite)) == applied
+
+
+@pytest.mark.parametrize("q1,q2,q3,q4,q5,q6",
+                         [(912.85, 996.25, 1100.5, 1204.75, 1288.15, 1309)])
+def test_expectations_quantile(q1, q2, q3, q4, q5, q6, before_and_after_test):
+    q1 = eval("q1")
+    q2 = eval("q2")
+    q3 = eval("q3")
+    q4 = eval("q4")
+    q5 = eval("q5")
+    q6 = eval("q6")
+    expected_ranges = [[q1, q2], [q2, q3],
+                       [q3, q4], [q4, q5],
+                       [q5, q6]]
+    name_expected, summary_expected = change_template([q1, q2, q3, q4, q5, q6],
+                                                      ["5%", "25%", "50%", "75%", "95%", "max"])
+    expectation_type = "expect_column_quantile_values_to_be_between"
+    batch_empty = before_and_after_test
+
+    q_ranges = calculate_q_ranges(summary_expected)
+    name, summary, batch = expectations_quantile(name_expected, summary_expected, batch_empty)
+
+    assert expected_ranges == q_ranges
+    assert name == name_expected
+    assert expectation_type in str(batch.expectation_suite)
+
+@pytest.mark.parametrize("min_median,max_median,value_counts_index_sorted,applied",
+                         [(892, 1500, pd.Series({892: 1, 893: 1, 894: 1, 1004: 2, 1500: 1}), True)])
+def test_expectations_median(min_median, max_median, value_counts_index_sorted, applied, before_and_after_test):
+    min_median_expected = eval("min_median")
+    max_median_expected = eval("max_median")
+    value_counts_index_sorted = eval("value_counts_index_sorted")
+    applied = eval("applied")
+    name_expected, summary_expected = change_template([value_counts_index_sorted], ["value_counts_index_sorted"])
+    expectation_type = "expect_column_median_to_be_between"
+    batch_empty = before_and_after_test
+
+    min_median, max_median = calculate_median(summary_expected)
+    name, summary, batch = expectations_median(name_expected, summary_expected, batch_empty)
+
+    assert (min_median == min_median_expected and max_median == max_median_expected)
+    assert name == name_expected
+    assert (expectation_type in str(batch.expectation_suite)) == applied

From 8d61800bfc9f7336944be7cdb5064dcc845e757e Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Tue, 1 Aug 2023 14:31:18 +0300
Subject: [PATCH 05/10] try to update gx

---
 functions/data_test/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/functions/data_test/requirements.txt b/functions/data_test/requirements.txt
index eab4527..b12002a 100755
--- a/functions/data_test/requirements.txt
+++ b/functions/data_test/requirements.txt
@@ -1,7 +1,7 @@
 boto3==1.26.66
 botocore==1.29.66
 importlib-metadata==6.0.0
-great-expectations==0.16.14
+great-expectations==0.17.5
 s3fs==0.4.2
 python-dateutil==2.8.2
 fastparquet==0.8.1

From d07491263445d99a2cde84e4de99ed4bd20b7e08 Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Tue, 1 Aug 2023 14:40:06 +0300
Subject: [PATCH 06/10] downgraded gx and fixed building of batch asset and request

---
 functions/data_test/requirements.txt        | 2 +-
 functions/data_test/tests/test_profiling.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/functions/data_test/requirements.txt b/functions/data_test/requirements.txt
index b12002a..eab4527 100755
--- a/functions/data_test/requirements.txt
+++ b/functions/data_test/requirements.txt
@@ -1,7 +1,7 @@
 boto3==1.26.66
 botocore==1.29.66
 importlib-metadata==6.0.0
-great-expectations==0.17.5
+great-expectations==0.16.14
 s3fs==0.4.2
 python-dateutil==2.8.2
 fastparquet==0.8.1
diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index 182d939..d179704 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -102,8 +102,8 @@ def before_and_after_test():
     df = pd.DataFrame(columns=['PassengerId'])
     context_gx = gx.get_context()
     datasource = context_gx.sources.add_pandas(name="test")
-    data_asset = datasource.add_dataframe_asset(name="test")
-    batch_request = data_asset.build_batch_request(dataframe=df)
+    data_asset = datasource.add_dataframe_asset(name="test", dataframe=df)
+    batch_request = data_asset.build_batch_request()
     context_gx.add_or_update_expectation_suite("test_suite")
     batch_empty = context_gx.get_validator(
         batch_request=batch_request,

From a325ca4bfe7227ba8b2e02299d8040cb83abca03 Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Tue, 1 Aug 2023 14:47:46 +0300
Subject: [PATCH 07/10] parametrize pre/post action

---
 functions/data_test/tests/test_profiling.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index d179704..045bc4f 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -16,6 +16,7 @@
                        expectations_median)
 import great_expectations as gx
 import pandas as pd
+from datetime import datetime
 
 ENDPOINT_URL = "http://localhost:4566"
 summary_template = {
@@ -99,21 +100,23 @@ def change_template(params, params_name):
 
 @pytest.fixture(autouse=True)
 def before_and_after_test():
+
     df = pd.DataFrame(columns=['PassengerId'])
     context_gx = gx.get_context()
-    datasource = context_gx.sources.add_pandas(name="test")
-    data_asset = datasource.add_dataframe_asset(name="test", dataframe=df)
+    suite_name = f"test_{datetime.now()}"
+    datasource = context_gx.sources.add_pandas(name=suite_name)
+    data_asset = datasource.add_dataframe_asset(name=suite_name, dataframe=df)
     batch_request = data_asset.build_batch_request()
-    context_gx.add_or_update_expectation_suite("test_suite")
+    context_gx.add_or_update_expectation_suite(f"{suite_name}_suite")
     batch_empty = context_gx.get_validator(
         batch_request=batch_request,
-        expectation_suite_name="test_suite",
+        expectation_suite_name=f"{suite_name}_suite",
     )
 
     yield batch_empty
 
-    context_gx.delete_expectation_suite("test_suite")
-    context_gx.delete_datasource("test")
+    context_gx.delete_expectation_suite(f"{suite_name}_suite")
+    context_gx.delete_datasource(suite_name)

From 026c149a723f629b931adc339739c5cda2c15844 Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Wed, 2 Aug 2023 10:25:38 +0300
Subject: [PATCH 08/10] try to use ephemeral data context

---
 functions/data_test/tests/test_profiling.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index 045bc4f..746774e 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -13,8 +13,10 @@
                        expectations_quantile,
                        calculate_q_ranges,
                        calculate_median,
-                       expectations_median)
+                       expectations_median,
+                       change_ge_config)
 import great_expectations as gx
+from great_expectations.data_context import EphemeralDataContext
 import pandas as pd
 from datetime import datetime
@@ -102,7 +104,8 @@ def change_template(params, params_name):
 def before_and_after_test():
 
     df = pd.DataFrame(columns=['PassengerId'])
-    context_gx = gx.get_context()
+    config = change_ge_config("test")
+    context_gx = EphemeralDataContext(project_config=config)
     suite_name = f"test_{datetime.now()}"
     datasource = context_gx.sources.add_pandas(name=suite_name)
     data_asset = datasource.add_dataframe_asset(name=suite_name, dataframe=df)

From de46f78ecd34cfbf76d37414f53c82607d1772d9 Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Wed, 2 Aug 2023 10:44:42 +0300
Subject: [PATCH 09/10] try not to create exp suite

---
 functions/data_test/tests/test_profiling.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index 746774e..b767924 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -16,7 +16,6 @@
                        expectations_median,
                        change_ge_config)
 import great_expectations as gx
-from great_expectations.data_context import EphemeralDataContext
 import pandas as pd
 from datetime import datetime
@@ -104,21 +103,17 @@ def change_template(params, params_name):
 def before_and_after_test():
 
     df = pd.DataFrame(columns=['PassengerId'])
-    config = change_ge_config("test")
-    context_gx = EphemeralDataContext(project_config=config)
+    context_gx = gx.get_context()
     suite_name = f"test_{datetime.now()}"
     datasource = context_gx.sources.add_pandas(name=suite_name)
     data_asset = datasource.add_dataframe_asset(name=suite_name, dataframe=df)
     batch_request = data_asset.build_batch_request()
-    context_gx.add_or_update_expectation_suite(f"{suite_name}_suite")
     batch_empty = context_gx.get_validator(
         batch_request=batch_request,
-        expectation_suite_name=f"{suite_name}_suite",
     )
 
     yield batch_empty
 
-    context_gx.delete_expectation_suite(f"{suite_name}_suite")
     context_gx.delete_datasource(suite_name)

From 33e82a3b67df9286ac91246384ff256fc5fa65ae Mon Sep 17 00:00:00 2001
From: bvolodarskiy
Date: Tue, 15 Aug 2023 12:20:05 +0300
Subject: [PATCH 10/10] removed empty line

---
 functions/data_test/tests/test_profiling.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/functions/data_test/tests/test_profiling.py b/functions/data_test/tests/test_profiling.py
index b767924..832acf1 100644
--- a/functions/data_test/tests/test_profiling.py
+++ b/functions/data_test/tests/test_profiling.py
@@ -101,7 +101,6 @@ def change_template(params, params_name):
 
 @pytest.fixture(autouse=True)
 def before_and_after_test():
-
     df = pd.DataFrame(columns=['PassengerId'])
     context_gx = gx.get_context()
     suite_name = f"test_{datetime.now()}"