-
Notifications
You must be signed in to change notification settings - Fork 7
/
CreateBabbyNames.sql
executable file
·645 lines (502 loc) · 16.2 KB
/
CreateBabbyNames.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
/****************************************
Copyright 2018 Kendra Little
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
https://opensource.org/licenses/MIT
****************************************/
/*************************************
Notes:
This script has been tested against SQL Server 2008R2 and SQL Server 2017 Developer Edition.
SQL Server 2017 Dev Edition is free:
https://www.microsoft.com/en-us/sql-server/sql-server-downloads-free-trial
Instructions:
Download names.zip from https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data
The original file is licensed under cc-zero (public domain dedication)
Download namesbystate.zip from https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-data-by-state-and-district-of-
Extract all files into C:\BabbyNamesImport
Modify the DataFilePath and LogFilePath in the script below to a location where you want your data and log files
Review the script and make sure the settings make sense for your instance
Enable SQLCMD in this SSMS Session (Query Menu, SQLCMD mode)
Run the script to import tables and create procedures.
Uncomment the backup command at the end and run that command alone to create compressed backup for distribution.
*************************************/
/* Don't remove the trailing \ or it will fail. */
:SETVAR DataSourcePath "C:\BabbyNamesImport\"
/* Database data file path - requires 5GB */
:SETVAR DataFilePath "C:\MSSQL\Data\"
/* Database log file path - requires 1GB, can be the same as data file path */
:SETVAR LogFilePath "C:\MSSQL\Data\"
/* Most recent year you have data */
:SETVAR MaxYear 2017
/****************************************
SQL Server Settings
****************************************/
exec sp_configure 'show advanced options', 1;
GO
RECONFIGURE
GO
exec sp_configure 'max server memory (MB)', 3500;
GO
RECONFIGURE
GO
/*
We're using defaults here for parallelism settings
These ain't always the best for production
But they're where we're starting for learning */
exec sp_configure 'cost threshold for parallelism', 5;
GO
exec sp_configure 'max degree of parallelism', 0;
GO
RECONFIGURE
GO
/****************************************
SET stuff n things
****************************************/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
SET STATISTICS TIME, IO OFF;
GO
SET NOCOUNT ON;
GO
:SETVAR Versn "1_300"
/****************************************
Recreate BabbyNames database.
****************************************/
DECLARE @msg NVARCHAR(MAX)
SET @msg=N'Drop BabbyNames database if it exists.'
DECLARE @msgtime NVARCHAR(24)
SET @msgtime = CONVERT(NVARCHAR(21), SYSDATETIME(), 121) + N': ';
SET @msg=@msgtime+@msg;
RAISERROR (@msg, 0, 1) WITH NOWAIT;
GO
USE master;
GO
IF DB_ID('BabbyNames') IS NOT NULL
BEGIN
ALTER DATABASE BabbyNames SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
DROP DATABASE BabbyNames;
END
GO
DECLARE @msg NVARCHAR(MAX)
SET @msg = N'Create BabbyNames database.';
DECLARE @msgtime NVARCHAR(24)
SET @msgtime = CONVERT(NVARCHAR(21), SYSDATETIME(), 121) + N': ';
SET @msg=@msgtime+@msg;
RAISERROR (@msg, 0, 1) WITH NOWAIT;
GO
CREATE DATABASE BabbyNames
ON PRIMARY
(NAME = BabbyNames, FILENAME = N'$(DataFilePath)BabbyNames.mdf',
SIZE = 1GB, FILEGROWTH = 256MB, MAXSIZE=50GB)
LOG ON
(NAME = BabbyNames_log, FILENAME = N'$(LogFilePath)BabbyNames_log.ldf',
SIZE = 512MB, FILEGROWTH = 256MB, MAXSIZE=20GB)
GO
/****************************************
Use BabbyNames and create logging proc and table.
The logging proc contains the version info. Because I'm lazy.
****************************************/
DECLARE @msg NVARCHAR(MAX);
SET @msg= N'USE BabbyNames and create evt schema.';
DECLARE @msgtime NVARCHAR(24);
SET @msgtime = CONVERT(NVARCHAR(21), SYSDATETIME(), 121) + N': ';
SET @msg=@msgtime+@msg;
RAISERROR (@msg, 0, 1) WITH NOWAIT;
GO
USE BabbyNames;
GO
CREATE SCHEMA evt AUTHORIZATION dbo;
GO
IF OBJECT_ID('evt.logme') IS NULL
EXEC ('CREATE PROCEDURE evt.logme as RETURN 0;')
GO
ALTER PROCEDURE evt.logme
@msg NVARCHAR(MAX)
AS
SET NOCOUNT ON;
DECLARE @ver NVARCHAR(128);
SET @ver = N'(ver $(Versn)) ';
SET @msg=@ver+@msg;
DECLARE @msgtime NVARCHAR(24);
SET @msgtime = CONVERT(NVARCHAR(21), SYSDATETIME(), 121) + N': ';
IF OBJECT_ID ('evt.Log') IS NOT NULL
INSERT evt.Log (LogEntry) VALUES (@msg);
SET @msg=@msgtime+@msg;
RAISERROR (@msg, 0, 1) WITH NOWAIT;
GO
EXEC evt.logme N'Create evt.Log table.';
GO
CREATE TABLE evt.[Log] (
LogId INT IDENTITY NOT NULL,
LogDate DATETIME2(0) NOT NULL
CONSTRAINT df_LogDate_sysdatetime
DEFAULT SYSDATETIME(),
LogEntry NVARCHAR(MAX),
);
GO
ALTER TABLE evt.[Log]
ADD CONSTRAINT pk_evtLog
PRIMARY KEY CLUSTERED (LogId)
WITH (MAXDOP=1);
GO
/****************************************
Load the national src tables from disk using BULK INSERT
****************************************/
EXEC evt.logme N'Load the national src tables from disk using BULK INSERT.';
GO
CREATE SCHEMA src AUTHORIZATION dbo;
GO
DECLARE @min int;
SET @min = 1880;
DECLARE @max int;
SELECT @max = $(MaxYear);
DECLARE @tablename nvarchar(256);
DECLARE @dsql nvarchar(max);
WHILE @min <= @max
BEGIN
SET @tablename=N'names_' + cast (@min as NCHAR(4))
EXEC evt.logme @tablename;
IF (SELECT COUNT(*) from sys.objects where name=@tablename) = 1
BEGIN
SET @dsql=N'
DROP TABLE src.' + @tablename + N';';
EXEC(@dsql);
END
BEGIN
SET @dsql=N'
CREATE TABLE src.' + @tablename + N' (
FirstName varchar(255) NOT NULL,
Gender char(1) NOT NULL,
NameCount INT NOT NULL
);
BULK INSERT BabbyNames.src.' + @tablename + N'
FROM ''$(DataSourcePath)yob' + cast (@min as char(4)) + N'.txt''
WITH ( FIELDTERMINATOR = '','', ROWTERMINATOR = ''\n'');
ALTER TABLE src.' + @tablename + N'
ADD ReportYear INT NOT NULL
DEFAULT(' + cast (@min as char(4)) + N');
ALTER TABLE src.' + @tablename + N'
ADD CONSTRAINT pk_src_' + @tablename + N' PRIMARY KEY CLUSTERED (ReportYear, FirstName, Gender);
ALTER TABLE src.' + @tablename + N' WITH CHECK
ADD CONSTRAINT ck_src_' + @tablename + N' CHECK (ReportYear = ' + cast (@min as char(4)) + N')
';
EXEC(@dsql);
END
SET @min=@min+1;
END
GO
/****************************************
Create a partitioned view for national src tables.
****************************************/
EXEC evt.logme N'Create the partitioned view for national src tables.';
IF (SELECT COUNT(*) from sys.objects where name='names_all') = 1
DROP VIEW src.names_all ;
GO
DECLARE @min INT;
SET @min=1881;
DECLARE @max INT;
SELECT @max = $(MaxYear);
DECLARE @tablename NVARCHAR(256);
DECLARE @dsql NVARCHAR(MAX);
SET @dsql = N'
CREATE VIEW src.names_all
AS
SELECT * FROM src.names_1880' + CHAR(10);
WHILE @min <= @max
BEGIN
SET @tablename=N'src.names_' + cast (@min as char(4))
SET @dsql=@dsql+
' UNION ALL ' + CHAR(10) +
' SELECT * FROM ' + @tablename;
SET @min=@min+1;
END
EXEC (@dsql);
GO
/****************************************
Create and load ref.FirstName and agg.FirstNameByYear
****************************************/
EXEC evt.logme N'Create schema ref.';
GO
CREATE SCHEMA ref AUTHORIZATION dbo;
GO
EXEC evt.logme N'Create ref.FirstName.';
GO
/* Not persisting NameLength just for demo opportunities later */
CREATE TABLE ref.FirstName(
FirstNameId INT IDENTITY NOT NULL,
FirstName VARCHAR(255) NOT NULL,
NameLength AS LEN(FirstName),
FirstReportYear INT NOT NULL,
LastReportYear INT NOT NULL,
TotalNameCount BIGINT NOT NULL
);
GO
EXEC evt.logme N'Key ref.FirstName.';
GO
ALTER TABLE ref.FirstName
ADD CONSTRAINT pk_FirstName_FirstNameId
PRIMARY KEY CLUSTERED (FirstNameId);
GO
EXEC evt.logme N'Load ref.FirstName.';
GO
INSERT ref.FirstName WITH (TABLOCK)
(FirstName,
FirstReportYear,
LastReportYear,
TotalNameCount)
SELECT
FirstName,
MIN(ReportYear) as FirstReportYear,
MAX(ReportYear) as LastReportYear,
SUM(NameCount) as TotalNameCount
FROM src.names_all
GROUP BY FirstName
OPTION (QUERYTRACEON 610);
GO
EXEC evt.logme N'Create schema agg.';
GO
CREATE SCHEMA agg AUTHORIZATION dbo;
GO
EXEC evt.logme N'Create agg.FirstNameByYear';
GO
CREATE TABLE agg.FirstNameByYear (
ReportYear INT NOT NULL,
FirstNameId INT NOT NULL,
Gender char(1) NOT NULL,
NameCount INT NOT NULL
);
GO
EXEC evt.logme N'Load agg.FirstNameByYear';
GO
INSERT agg.FirstNameByYear WITH (TABLOCK)
(ReportYear, FirstNameId, Gender, NameCount)
SELECT
na.ReportYear,
fn.FirstNameId,
na.Gender,
na.NameCount
FROM src.names_all na
JOIN ref.FirstName fn on na.FirstName=fn.FirstName;
GO
EXEC evt.logme N'Key agg.FirstNameByYear';
GO
ALTER TABLE agg.FirstNameByYear
ADD CONSTRAINT pk_aggFirstNameByYear
PRIMARY KEY CLUSTERED (ReportYear, FirstNameId, Gender)
WITH (MAXDOP=1);
GO
EXEC evt.logme N'Create foreign key on agg.FirstNameByYear referencing ref.FirstName';
GO
ALTER TABLE agg.FirstNameByYear
ADD CONSTRAINT fk_FirstNameByYear_FirstName
FOREIGN KEY (FirstNameId)
REFERENCES ref.FirstName(FirstNameId);
GO
/****************************************
Drop national src objects
****************************************/
EXEC evt.logme N'Drop national src objects.';
GO
IF (SELECT COUNT(*) from sys.objects where name='names_all') = 1
DROP VIEW src.names_all ;
GO
DECLARE @min INT;
SET @min=1880;
DECLARE @max INT;
SELECT @max = $(MaxYear);
DECLARE @tablename NVARCHAR(256);
DECLARE @dsql NVARCHAR(MAX);
WHILE @min <= @max
BEGIN
SET @tablename = N'src.names_' + cast (@min as nchar(4))
SET @dsql = N'DROP TABLE ' + @tablename
EXEC evt.logme @dsql;
IF (SELECT OBJECT_ID(@tablename)) IS NOT NULL
BEGIN
EXEC(@dsql);
END
ELSE
EXEC evt.logme N'Table doesn''t exist, skipping.';
SET @min=@min+1;
END
GO
/****************************************
Create and populate ref.State
****************************************/
EXEC evt.logme N'Create and populate ref.State.';
GO
CREATE TABLE ref.State (
StateCode CHAR(2) NOT NULL,
StateName VARCHAR(128) NOT NULL,
CONSTRAINT pk_ref_State PRIMARY KEY CLUSTERED (StateCode)
);
GO
INSERT ref.State (StateCode, StateName)
VALUES
('AL', 'Alabama'),
('AK', 'Alaska'),
('AZ', 'Arizona'),
('AR', 'Arkansas'),
('CA', 'California'),
('CO', 'Colorado'),
('CT', 'Connecticut'),
('DE', 'Delaware'),
('DC', 'District of Columbia'),
('FL', 'Florida'),
('GA', 'Georgia'),
('HI', 'Hawaii'),
('ID', 'Idaho'),
('IL', 'Illinois'),
('IN', 'Indiana'),
('IA', 'Iowa'),
('KS', 'Kansas'),
('KY', 'Kentucky'),
('LA', 'Louisiana'),
('ME', 'Maine'),
('MD', 'Maryland'),
('MA', 'Massachusetts'),
('MI', 'Michigan'),
('MN', 'Minnesota'),
('MS', 'Mississippi'),
('MO', 'Missouri'),
('MT', 'Montana'),
('NE', 'Nebraska'),
('NV', 'Nevada'),
('NH', 'New Hampshire'),
('NJ', 'New Jersey'),
('NM', 'New Mexico'),
('NY', 'New York'),
('NC', 'North Carolina'),
('ND', 'North Dakota'),
('OH', 'Ohio'),
('OK', 'Oklahoma'),
('OR', 'Oregon'),
('PA', 'Pennsylvania'),
('RI', 'Rhode Island'),
('SC', 'South Carolina'),
('SD', 'South Dakota'),
('TN', 'Tennessee'),
('TX', 'Texas'),
('UT', 'Utah'),
('VT', 'Vermont'),
('VA', 'Virginia'),
('WA', 'Washington'),
('WV', 'West Virginia'),
('WI', 'Wisconsin'),
('WY', 'Wyoming')
/****************************************
Load src.StateDataRaw from disk
****************************************/
EXEC evt.logme N'Load src.StateDataRaw from disk.';
GO
CREATE TABLE src.StateDataRaw (
StateCode CHAR(2) NOT NULL,
Gender CHAR(1) NOT NULL,
BirthYear INT NOT NULL,
FirstName nvarchar(15) NOT NULL,
NameCount int NOT NULL
);
declare @tablename nvarchar(256),
@dsql nvarchar(max),
@statecode char(2),
@statename varchar(128);
DECLARE @StateDataLoad as CURSOR;
SET @StateDataLoad = CURSOR FOR
SELECT StateCode, StateName
FROM ref.State;
OPEN @StateDataLoad;
FETCH NEXT FROM @StateDataLoad INTO @statecode, @statename;
WHILE @@FETCH_STATUS = 0
BEGIN
EXEC evt.logme @statecode;
BEGIN
SET @dsql=N'
BULK INSERT BabbyNames.src.StateDataRaw
FROM ''$(DataSourcePath)' + @statecode + N'.TXT''
WITH ( FIELDTERMINATOR = '','', ROWTERMINATOR = ''\n'');
';
EXEC(@dsql);
END
FETCH NEXT FROM @StateDataLoad INTO @statecode, @statename;
END
CLOSE @StateDataLoad;
DEALLOCATE @StateDataLoad;
ALTER TABLE src.StateDataRaw
ADD CONSTRAINT pk_src_StateDataRaw PRIMARY KEY CLUSTERED
(StateCode, BirthYear, FirstName, Gender);
GO
/****************************************
Create and populate agg.FirstNameByYearState
****************************************/
EXEC evt.logme N'Create and populate agg.FirstNameByYearState';
GO
CREATE TABLE agg.FirstNameByYearState (
ReportYear INT NOT NULL,
StateCode char(2) NOT NULL,
FirstNameId INT NOT NULL,
Gender char(1) NOT NULL,
NameCount INT NOT NULL
);
GO
INSERT agg.FirstNameByYearState WITH (TABLOCK)
(ReportYear, StateCode, FirstNameId, Gender, NameCount)
SELECT
na.BirthYear,
na.StateCode,
fn.FirstNameId,
na.Gender,
na.NameCount
FROM src.StateDataRaw na
JOIN ref.FirstName fn on na.FirstName=fn.FirstName;
GO
EXEC evt.logme N'Key agg.FirstNameByYearState';
GO
ALTER TABLE agg.FirstNameByYearState
ADD CONSTRAINT pk_aggFirstNameByYearState
PRIMARY KEY CLUSTERED (ReportYear, StateCode, FirstNameId, Gender);
GO
EXEC evt.logme N'Create foreign key on agg.FirstNameByYearState referencing ref.FirstName';
GO
ALTER TABLE agg.FirstNameByYearState
ADD CONSTRAINT fk_FirstNameByYearState_FirstName
FOREIGN KEY (FirstNameId)
REFERENCES ref.FirstName(FirstNameId);
GO
DROP TABLE src.StateDataRaw;
GO
/****************************************
Report back on index sizes & finish up
****************************************/
TRUNCATE TABLE evt.Log;
GO
CHECKPOINT
GO
SELECT
sc.name + '.' + so.name as table_name,
ps.index_id as index_id,
ps.reserved_page_count * 8. /1024. as size_MB,
ps.row_count
FROM sys.dm_db_partition_stats ps
JOIN sys.objects so on ps.object_id=so.object_id
JOIN sys.schemas sc on so.schema_id=sc.schema_id
WHERE so.is_ms_shipped=0
ORDER BY size_MB desc;
GO
DECLARE @min int
DECLARE @max int
DECLARE @msg nvarchar(max)
SELECT @min = MIN(ReportYear), @max = MAX(ReportYear) from agg.FirstNameByYear;
SELECT @msg = N'agg.FirstNameByYear has data from ' + cast(@min as nvarchar(4)) + N' to ' + cast(@max as nvarchar(4));
EXEC evt.logme @msg;
SELECT @min = MIN(ReportYear), @max = MAX(ReportYear) from agg.FirstNameByYearState;
SELECT @msg = N'agg.FirstNameByYearState has data from ' + cast(@min as nvarchar(4)) + N' to ' + cast(@max as nvarchar(4));
EXEC evt.logme @msg;
GO
--BACKUP DATABASE BabbyNames TO DISK = N'S:\MSSQL\Backup\BabbyNames.bak'
-- WITH COMPRESSION, FORMAT, INIT;
--GO