From 694bc0943ce9825ad9073fe2d98dd86d0da0b5aa Mon Sep 17 00:00:00 2001 From: Amelia <30877272+amsnyder@users.noreply.github.com> Date: Tue, 18 Jan 2022 09:45:24 -0500 Subject: [PATCH 1/5] separate gap analysis csv and plot outputs --- data_exploration/src/gap_analysis.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/data_exploration/src/gap_analysis.py b/data_exploration/src/gap_analysis.py index 27d6b25..cf9b94e 100644 --- a/data_exploration/src/gap_analysis.py +++ b/data_exploration/src/gap_analysis.py @@ -49,7 +49,9 @@ def gap_analysis_calc(source, var_dfs): var_site_gap_df.loc[year, 'n_gaps'] = len(gaps) var_site_gap_df.loc[year, 'gap_median_days'] = gaps.median().days if pd.notna(gaps.median().days) else 0 var_site_gap_df.loc[year, 'gap_max_days'] = gaps.max().days if pd.notna(gaps.max().days) else 0 - var_site_gap_df.to_csv(os.path.join('data_exploration', 'out', f'{source}_{var}_{site}_gap_analysis.csv')) + # make output directory if it doesn't exist + os.makdirs(os.path.join('data_exploration', 'out', 'gap_analysis_csvs'), exist_ok = True) + var_site_gap_df.to_csv(os.path.join('data_exploration', 'out', 'gap_analysis_csvs', f'{source}_{var}_{site}_gap_analysis.csv')) metric_dfs[var][site]= var_site_gap_df return metric_dfs, metrics @@ -68,7 +70,9 @@ def plot_gap_analysis(source, metric_dfs, metrics, site_colors): handles, labels = axs[0].get_legend_handles_labels() fig.legend(handles, labels, bbox_to_anchor=(1.15,0.9), loc='upper right') fig.suptitle(var) - save_path = os.path.join('data_exploration', 'out', f'{source}_{var}_gap_analysis_plot.png') + # make output directory if it doesn't exist + os.makdirs(os.path.join('data_exploration', 'out', 'gap_analysis_plots'), exist_ok = True) + save_path = os.path.join('data_exploration', 'out', 'gap_analysis_plots', f'{source}_{var}_gap_analysis_plot.png') fig.savefig(save_path, bbox_inches = 'tight') def main(): @@ -85,7 +89,8 @@ def main(): config = yaml.safe_load(stream)['gap_analysis.py'] # read in data source we want to do gap analysis for source = config['source'] - os.makdirs('data_exploration/out/', exist_ok = True) + # make output directory if it doesn't exist + os.makdirs(os.path.join('data_exploration', 'out'), exist_ok = True) # fetch site data and compile into nested dictionary of dataframes var_dfs = compile_data(var_names, source) From f7f6eadaa49a3d9650ca04a0750d98fcb90800bc Mon Sep 17 00:00:00 2001 From: Amelia <30877272+amsnyder@users.noreply.github.com> Date: Tue, 18 Jan 2022 09:51:59 -0500 Subject: [PATCH 2/5] fix makedirs --- data_exploration/src/gap_analysis.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data_exploration/src/gap_analysis.py b/data_exploration/src/gap_analysis.py index cf9b94e..d36a1a1 100644 --- a/data_exploration/src/gap_analysis.py +++ b/data_exploration/src/gap_analysis.py @@ -31,6 +31,7 @@ def gap_analysis_calc(source, var_dfs): gap_template_df = pd.DataFrame(columns=metrics) metric_dfs = {} for var, df in var_dfs.items(): + print(f'calculating metrics for {var}') df.dropna(axis=0, how='all', inplace=True) if df.empty: continue @@ -50,13 +51,14 @@ def gap_analysis_calc(source, var_dfs): var_site_gap_df.loc[year, 'gap_median_days'] = gaps.median().days if pd.notna(gaps.median().days) else 0 var_site_gap_df.loc[year, 'gap_max_days'] = gaps.max().days if pd.notna(gaps.max().days) else 0 # make output directory if it doesn't exist - os.makdirs(os.path.join('data_exploration', 'out', 'gap_analysis_csvs'), exist_ok = True) + os.makedirs(os.path.join('data_exploration', 'out', 'gap_analysis_csvs'), exist_ok = True) var_site_gap_df.to_csv(os.path.join('data_exploration', 'out', 'gap_analysis_csvs', f'{source}_{var}_{site}_gap_analysis.csv')) metric_dfs[var][site]= var_site_gap_df return metric_dfs, metrics def plot_gap_analysis(source, metric_dfs, metrics, site_colors): for var, data_by_site in metric_dfs.items(): + print(f'plotting metrics for {var}') plot_df = pd.DataFrame() fig, axs = plt.subplots(4, sharex=True, figsize=(8,8)) i=0 @@ -71,7 +73,7 @@ def plot_gap_analysis(source, metric_dfs, metrics, site_colors): fig.legend(handles, labels, bbox_to_anchor=(1.15,0.9), loc='upper right') fig.suptitle(var) # make output directory if it doesn't exist - os.makdirs(os.path.join('data_exploration', 'out', 'gap_analysis_plots'), exist_ok = True) + os.makedirs(os.path.join('data_exploration', 'out', 'gap_analysis_plots'), exist_ok = True) save_path = os.path.join('data_exploration', 'out', 'gap_analysis_plots', f'{source}_{var}_gap_analysis_plot.png') fig.savefig(save_path, bbox_inches = 'tight') @@ -90,7 +92,7 @@ def main(): # read in data source we want to do gap analysis for source = config['source'] # make output directory if it doesn't exist - os.makdirs(os.path.join('data_exploration', 'out'), exist_ok = True) + os.makedirs(os.path.join('data_exploration', 'out'), exist_ok = True) # fetch site data and compile into nested dictionary of dataframes var_dfs = compile_data(var_names, source) From 8cd0dead8be3a360615a9c0f932171f60d0bdb9d Mon Sep 17 00:00:00 2001 From: Amelia <30877272+amsnyder@users.noreply.github.com> Date: Tue, 18 Jan 2022 09:54:13 -0500 Subject: [PATCH 3/5] only drop columns with ALL data missing in munge --- 02_munge/src/munge_usgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/02_munge/src/munge_usgs.py b/02_munge/src/munge_usgs.py index 2f32b6c..c5b1033 100644 --- a/02_munge/src/munge_usgs.py +++ b/02_munge/src/munge_usgs.py @@ -72,7 +72,7 @@ def process_data_to_csv(raw_datafile, params_to_process, params_df, flags_to_dro df.drop(col, axis=1, inplace=True) # drop any columns with no data - df.dropna(axis=1, inplace=True) + df.dropna(axis=1, how=all, inplace=True) # process parameter codes to names df = param_code_to_name(df, params_df) From d163a3848b6a5b2fce67889418e55df602324e59 Mon Sep 17 00:00:00 2001 From: Amelia <30877272+amsnyder@users.noreply.github.com> Date: Tue, 18 Jan 2022 09:54:34 -0500 Subject: [PATCH 4/5] only drop columns with ALL data missing in munge --- 02_munge/src/munge_usgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/02_munge/src/munge_usgs.py b/02_munge/src/munge_usgs.py index c5b1033..a83a9d9 100644 --- a/02_munge/src/munge_usgs.py +++ b/02_munge/src/munge_usgs.py @@ -72,7 +72,7 @@ def process_data_to_csv(raw_datafile, params_to_process, params_df, flags_to_dro df.drop(col, axis=1, inplace=True) # drop any columns with no data - df.dropna(axis=1, how=all, inplace=True) + df.dropna(axis=1, how='all', inplace=True) # process parameter codes to names df = param_code_to_name(df, params_df) From fc91d294c26347cd9acfe592114111e47939ec95 Mon Sep 17 00:00:00 2001 From: Amelia <30877272+amsnyder@users.noreply.github.com> Date: Tue, 18 Jan 2022 10:01:55 -0500 Subject: [PATCH 5/5] move makedirs outside for loop --- data_exploration/src/gap_analysis.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data_exploration/src/gap_analysis.py b/data_exploration/src/gap_analysis.py index d36a1a1..a074be6 100644 --- a/data_exploration/src/gap_analysis.py +++ b/data_exploration/src/gap_analysis.py @@ -27,6 +27,9 @@ def compile_data(nwis_var_names, source): return var_dfs def gap_analysis_calc(source, var_dfs): + # make output directory if it doesn't exist + os.makedirs(os.path.join('data_exploration', 'out', 'gap_analysis_csvs'), exist_ok = True) + # define metric names that we will calculate metrics = ['p_coverage', 'n_gaps', 'gap_median_days', 'gap_max_days'] gap_template_df = pd.DataFrame(columns=metrics) metric_dfs = {} @@ -50,13 +53,13 @@ def gap_analysis_calc(source, var_dfs): var_site_gap_df.loc[year, 'n_gaps'] = len(gaps) var_site_gap_df.loc[year, 'gap_median_days'] = gaps.median().days if pd.notna(gaps.median().days) else 0 var_site_gap_df.loc[year, 'gap_max_days'] = gaps.max().days if pd.notna(gaps.max().days) else 0 - # make output directory if it doesn't exist - os.makedirs(os.path.join('data_exploration', 'out', 'gap_analysis_csvs'), exist_ok = True) var_site_gap_df.to_csv(os.path.join('data_exploration', 'out', 'gap_analysis_csvs', f'{source}_{var}_{site}_gap_analysis.csv')) metric_dfs[var][site]= var_site_gap_df return metric_dfs, metrics def plot_gap_analysis(source, metric_dfs, metrics, site_colors): + # make output directory if it doesn't exist + os.makedirs(os.path.join('data_exploration', 'out', 'gap_analysis_plots'), exist_ok = True) for var, data_by_site in metric_dfs.items(): print(f'plotting metrics for {var}') plot_df = pd.DataFrame() @@ -72,8 +75,6 @@ def plot_gap_analysis(source, metric_dfs, metrics, site_colors): handles, labels = axs[0].get_legend_handles_labels() fig.legend(handles, labels, bbox_to_anchor=(1.15,0.9), loc='upper right') fig.suptitle(var) - # make output directory if it doesn't exist - os.makedirs(os.path.join('data_exploration', 'out', 'gap_analysis_plots'), exist_ok = True) save_path = os.path.join('data_exploration', 'out', 'gap_analysis_plots', f'{source}_{var}_gap_analysis_plot.png') fig.savefig(save_path, bbox_inches = 'tight')