Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using --schema flag for having dump schema level granularity and make COPY less error prone #14

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 39 additions & 18 deletions pg_sample
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,12 @@ Rules are applied in order with the first match taking precedence.
Randomize the rows initially selected from each table. May significantly
increase the running time of the script.

=item B<--schema=>I<name>
=item B<--schema=>I<name>

The name of the schema to which sampling will be limited.
If not specified, all schemas will be sampled.

=item B<--sample_schema=>I<name>

The schema name to use for the sample database (defaults to _pg_sample).

Expand Down Expand Up @@ -364,7 +369,7 @@ sub sample_table ($) {
my $table = shift;

my $sample_table = join '_', $table->schema || 'public', $table->table;
return Table->new($opt{schema}, $sample_table);
return Table->new($opt{sample_schema}, $sample_table);
}

sub notice (@) {
Expand All @@ -377,7 +382,8 @@ sub notice (@) {
db_port => '',
keep => 0,
random => 0,
schema => '_pg_sample',
schema => undef,
sample_schema => '_pg_sample',
verbose => 0,
);

Expand All @@ -395,12 +401,14 @@ GetOptions(\%opt,
"keep",
"limit=s@",
"random",
"sample_schema=s",
"schema=s",
"trace",
"verbose|v",
"version|V",
);


if ($opt{version}) {
print "$VERSION\n";
exit 0;
Expand All @@ -427,24 +435,25 @@ my $dbh = connect_db(%opt) or croak "unable to connect to database";

my $pg_version = pg_version;

if ($opt{schema} eq 'public') {
die "Error: refusing to use 'public' schema for sampling.\n";
if (index($opt{sample_schema}, "_pg_sample") == -1) {
# Reduce the risk of accidentally overwriting (or, with --force, dropping) an existing database schema
die "Error: --sample_schema have to contain '_pg_sample' suffix";
}

my ($schema_oid) = $dbh->selectrow_array(qq{
SELECT oid
FROM pg_catalog.pg_namespace
WHERE nspname = ?
}, undef, $opt{schema});
}, undef, $opt{sample_schema});
if ($schema_oid && !$opt{force}) {
die "Error: schema '$opt{schema}' already exists. " .
die "Error: schema '$opt{sample_schema}' already exists. " .
"Use --force option to overwrite.\n";
}

$dbh->do(qq{ SET client_min_messages = warning }); # suppress notice messages
if ($opt{force}) {
notice "Dropping sample schema $opt{schema}\n";
$dbh->do(qq{ DROP SCHEMA IF EXISTS $opt{schema} CASCADE });
notice "Dropping sample schema $opt{sample_schema}\n";
$dbh->do(qq{ DROP SCHEMA IF EXISTS $opt{sample_schema} CASCADE });
}

if ($opt{file}) {
Expand All @@ -467,15 +476,19 @@ unless ($opt{'data-only'}) {
local $ENV{PGPORT} = $opt{db_port};
local $ENV{PGCLIENTENCODING} = $opt{encoding};

my $cmd = "pg_dump --schema-only";
my $cmd = "pg_dump --schema-only --password";
if ($opt{schema}) {
$cmd = $cmd . " --schema=$opt{schema}";
}

system($cmd) == 0 or croak "command '$cmd' failed: $?";
}

# If running PostgreSQL 9.1 or later, use UNLOGGED tables
my $unlogged = $pg_version >= version->declare('9.1') ? 'UNLOGGED' : '';

notice "Creating sample schema $opt{schema}\n";
$dbh->do(qq{ CREATE SCHEMA $opt{schema} });
notice "Creating sample schema $opt{sample_schema}\n";
$dbh->do(qq{ CREATE SCHEMA $opt{sample_schema} });
my $created_schema = 1; # keep track that we actually did it; see END block

# parse limit rules
Expand All @@ -500,6 +513,7 @@ my $sth = $dbh->table_info(undef, undef, undef, 'TABLE');
while (my $row = lower_keys($sth->fetchrow_hashref)) {
next unless uc $row->{table_type} eq 'TABLE'; # skip SYSTEM TABLE values
next if $row->{table_schem} eq 'information_schema'; # special pg schema
next if ($opt{schema}) && $row->{table_schem} ne $opt{schema};

my $sname = $row->{pg_schema} || unquote_identifier($row->{TABLE_SCHEM})
or die "no pg_schema or TABLE_SCHEM value?!";
Expand Down Expand Up @@ -589,7 +603,7 @@ foreach my $fk (@fks) {
my ($fk_table, $table, @pairs) = @$fk;

my $sample_fk_table = $sample_tables{ $fk_table };
my $idx_name = $dbh->quote_identifier($opt{schema} . '_idx' . ++$idx);
my $idx_name = $dbh->quote_identifier($opt{sample_schema} . '_idx' . ++$idx);
my $fk_cols = join ', ', map { $_->[0] } @pairs;
$dbh->do(qq{ CREATE INDEX $idx_name ON $sample_fk_table ($fk_cols) });
}
Expand Down Expand Up @@ -670,7 +684,7 @@ foreach my $name (keys %seq) {
print <<EOF;

SET client_encoding = '$opt{encoding}';
SET standard_conforming_strings = off;
-- SET standard_conforming_strings = off; -- NOTE: commented out (will use the default set to on)
SET check_function_bodies = false;
SET client_min_messages = warning;
SET escape_string_warning = off;
Expand All @@ -681,6 +695,8 @@ notice "Exporting sequences\n";
print "\n";
foreach my $name (sort keys %seq) {
my $constant = quote_constant($name);
my ($schema_name, $table_name) = split('\.', $name, 2);
next if ($opt{schema}) && ($schema_name ne '"'.$opt{schema}.'"');
print "SELECT pg_catalog.setval($constant, $seq{$name});\n";
}
print "\n";
Expand All @@ -698,8 +714,13 @@ foreach my $table (@tables) {
my ($count) = $dbh->selectrow_array("SELECT count(*) FROM $sample_table");
notice "Exporting data from $sample_table ($count)\n";
}
print "COPY $table FROM stdin;\n";
$dbh->do(qq{ COPY $sample_table TO STDOUT });
my ($schema_name, $table_name) = split('\.', $table, 2);
my $cleaned_table_name = substr $table_name, 1, -1;
my $cleaned_schema_name = substr $schema_name, 1, -1;
my ($q) = "SELECT string_agg(quote_ident(column_name), ',') FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '$cleaned_table_name' AND TABLE_SCHEMA = '$cleaned_schema_name'";
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mla this was needed to explicitly persist the order of columns, which may differ in prod vs dev db.

my ($column_names_to_keep_order) = $dbh->selectrow_array($q);
print "COPY $table ($column_names_to_keep_order) FROM stdin WITH CSV DELIMITER E'\\t' QUOTE '\b' ESCAPE '\\';\n";
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mla this was needed for proper escaping of the JSONB fields mentioned earlier.

$dbh->do(qq{ COPY $sample_table TO STDOUT WITH CSV DELIMITER E'\\t' QUOTE '\b' ESCAPE '\\'});
my $buffer = '';
print $buffer while $dbh->pg_getcopydata($buffer) >= 0;
print "\\.\n\n";
Expand All @@ -715,8 +736,8 @@ print "\n";
END {
# remove sample tables unless requested not to
if ($created_schema && !$opt{keep}) {
notice "Dropping sample schema $opt{schema}\n";
$dbh->do("DROP SCHEMA $opt{schema} CASCADE");
notice "Dropping sample schema $opt{sample_schema}\n";
$dbh->do("DROP SCHEMA $opt{sample_schema} CASCADE");
}

notice "Done.\n";
Expand Down