From 2600b41cd2ce4942048e57cbca96bd5d0ac4689b Mon Sep 17 00:00:00 2001 From: Andrzej Kostanski Date: Fri, 24 Jan 2020 13:17:41 +0100 Subject: [PATCH 1/2] Using --schema flag for having dump schema level granularity like in pg_dump and introduce --sample_schema overtaking old functionality --- pg_sample | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/pg_sample b/pg_sample index fda53cc..83a56af 100755 --- a/pg_sample +++ b/pg_sample @@ -112,7 +112,12 @@ Rules are applied in order with the first match taking precedence. Randomize the rows initially selected from each table. May significantly increase the running time of the script. -=item B<--schema=>I<name> +=item B<--schema=>I<name> + +The name of the schema to which the sampling will be limited. +If not specified, all schemas will be sampled. + +=item B<--sample_schema=>I<name> The schema name to use for the sample database (defaults to _pg_sample). @@ -364,7 +369,7 @@ sub sample_table ($) { my $table = shift; my $sample_table = join '_', $table->schema || 'public', $table->table; - return Table->new($opt{schema}, $sample_table); + return Table->new($opt{sample_schema}, $sample_table); } sub notice (@) { @@ -377,7 +382,8 @@ sub notice (@) { db_port => '', keep => 0, random => 0, - schema => '_pg_sample', + schema => undef, + sample_schema => '_pg_sample', verbose => 0, ); @@ -395,12 +401,14 @@ GetOptions(\%opt, "keep", "limit=s@", "random", + "sample_schema=s", "schema=s", "trace", "verbose|v", "version|V", ); + if ($opt{version}) { print "$VERSION\n"; exit 0; @@ -427,7 +435,8 @@ my $dbh = connect_db(%opt) or croak "unable to connect to database"; my $pg_version = pg_version; -if ($opt{schema} eq 'public') { +if ($opt{sample_schema} eq 'public') { + # TODO worth check if smbd won't destroy one of exisitin db schemas !!!! 
die "Error: refusing to use 'public' schema for sampling.\n"; } @@ -435,16 +444,16 @@ my ($schema_oid) = $dbh->selectrow_array(qq{ SELECT oid FROM pg_catalog.pg_namespace WHERE nspname = ? -}, undef, $opt{schema}); +}, undef, $opt{sample_schema}); if ($schema_oid && !$opt{force}) { - die "Error: schema '$opt{schema}' already exists. " . + die "Error: schema '$opt{sample_schema}' already exists. " . "Use --force option to overwrite.\n"; } $dbh->do(qq{ SET client_min_messages = warning }); # suppress notice messages if ($opt{force}) { - notice "Dropping sample schema $opt{schema}\n"; - $dbh->do(qq{ DROP SCHEMA IF EXISTS $opt{schema} CASCADE }); + notice "Dropping sample schema $opt{sample_schema}\n"; + $dbh->do(qq{ DROP SCHEMA IF EXISTS $opt{sample_schema} CASCADE }); } if ($opt{file}) { @@ -474,8 +483,8 @@ unless ($opt{'data-only'}) { # If running PostgreSQL 9.1 or later, use UNLOGGED tables my $unlogged = $pg_version >= version->declare('9.1') ? 'UNLOGGED' : ''; -notice "Creating sample schema $opt{schema}\n"; -$dbh->do(qq{ CREATE SCHEMA $opt{schema} }); +notice "Creating sample schema $opt{sample_schema}\n"; +$dbh->do(qq{ CREATE SCHEMA $opt{sample_schema} }); my $created_schema = 1; # keep track that we actually did it; see END block # parse limit rules @@ -500,6 +509,7 @@ my $sth = $dbh->table_info(undef, undef, undef, 'TABLE'); while (my $row = lower_keys($sth->fetchrow_hashref)) { next unless uc $row->{table_type} eq 'TABLE'; # skip SYSTEM TABLE values next if $row->{table_schem} eq 'information_schema'; # special pg schema + next if ($opt{schema}) && $row->{table_schem} ne $opt{schema}; my $sname = $row->{pg_schema} || unquote_identifier($row->{TABLE_SCHEM}) or die "no pg_schema or TABLE_SCHEM value?!"; @@ -589,7 +599,7 @@ foreach my $fk (@fks) { my ($fk_table, $table, @pairs) = @$fk; my $sample_fk_table = $sample_tables{ $fk_table }; - my $idx_name = $dbh->quote_identifier($opt{schema} . '_idx' . 
++$idx); + my $idx_name = $dbh->quote_identifier($opt{sample_schema} . '_idx' . ++$idx); my $fk_cols = join ', ', map { $_->[0] } @pairs; $dbh->do(qq{ CREATE INDEX $idx_name ON $sample_fk_table ($fk_cols) }); } @@ -715,8 +725,8 @@ print "\n"; END { # remove sample tables unless requested not to if ($created_schema && !$opt{keep}) { - notice "Dropping sample schema $opt{schema}\n"; - $dbh->do("DROP SCHEMA $opt{schema} CASCADE"); + notice "Dropping sample schema $opt{sample_schema}\n"; + $dbh->do("DROP SCHEMA $opt{sample_schema} CASCADE"); } notice "Done.\n"; From fd8cae3cad8090b9463affe7895b353a7600439f Mon Sep 17 00:00:00 2001 From: Andrzej Kostanski Date: Fri, 31 Jan 2020 11:42:28 +0100 Subject: [PATCH 2/2] improved per schema filtering, make COPY less error prone changed DELIMITER, QUOTE and ESCAPE for better handling of complicated data type fields (JSON in particular) added explicitly the table columns listed in COPY FROM to handle the situation when the order of columns differs between source and destination databases added flag --password to explicitly ask for password when pg_dump is needed (i.e. each run without --data-only flag) commented out standard_conforming_strings = off so this will be set to default value, which is on and is needed for import with \t delimiter introduced the requirement that --sample_schema if specified must include _pg_sample in name to avoid the risk of accidental deletion when used with --force --- pg_sample | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pg_sample b/pg_sample index 83a56af..afa8828 100755 --- a/pg_sample +++ b/pg_sample @@ -435,9 +435,9 @@ my $dbh = connect_db(%opt) or croak "unable to connect to database"; my $pg_version = pg_version; -if ($opt{sample_schema} eq 'public') { - # TODO worth check if smbd won't destroy one of exisitin db schemas !!!! 
- die "Error: refusing to use 'public' schema for sampling.\n"; +if (index($opt{sample_schema}, "_pg_sample") == -1) { + # reduce the risk of accidentally overwriting or removing (when used with --force) existing db schemas + die "Error: --sample_schema must contain '_pg_sample'"; } my ($schema_oid) = $dbh->selectrow_array(qq{ @@ -476,7 +476,11 @@ unless ($opt{'data-only'}) { local $ENV{PGPORT} = $opt{db_port}; local $ENV{PGCLIENTENCODING} = $opt{encoding}; - my $cmd = "pg_dump --schema-only"; + my $cmd = "pg_dump --schema-only --password"; + if ($opt{schema}) { + $cmd = $cmd . " --schema=$opt{schema}"; + } + system($cmd) == 0 or croak "command '$cmd' failed: $?"; } @@ -680,7 +684,7 @@ foreach my $name (keys %seq) { print <selectrow_array("SELECT count(*) FROM $sample_table"); notice "Exporting data from $sample_table ($count)\n"; } - print "COPY $table FROM stdin;\n"; - $dbh->do(qq{ COPY $sample_table TO STDOUT }); + my ($schema_name, $table_name) = split('\.', $table, 2); + my $cleaned_table_name = substr $table_name, 1, -1; + my $cleaned_schema_name = substr $schema_name, 1, -1; + my ($q) = "SELECT string_agg(quote_ident(column_name), ',') FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '$cleaned_table_name' AND TABLE_SCHEMA = '$cleaned_schema_name'"; + my ($column_names_to_keep_order) = $dbh->selectrow_array($q); + print "COPY $table ($column_names_to_keep_order) FROM stdin WITH CSV DELIMITER E'\\t' QUOTE '\b' ESCAPE '\\';\n"; + $dbh->do(qq{ COPY $sample_table TO STDOUT WITH CSV DELIMITER E'\\t' QUOTE '\b' ESCAPE '\\'}); my $buffer = ''; print $buffer while $dbh->pg_getcopydata($buffer) >= 0; print "\\.\n\n";