-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathd1_create_sql_dataset.sql
99 lines (88 loc) · 2.51 KB
/
d1_create_sql_dataset.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
-- Reset Demo
-- DROP DATABASE dataAnalysisDb;
-- GO
-- Create Database
IF NOT EXISTS (SELECT * FROM master.sys.databases WHERE name = N'dataAnalysisDb')
CREATE DATABASE dataAnalysisDb;
GO
USE dataAnalysisDB;
GO
-- Create Schema
IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name = N'demoData')
EXEC('CREATE SCHEMA demoData');
GO
-- Parquet is an Open Source Format.
-- Data is stored by column instead of by row like most RDBS
-- Supports Compression: Snappy, Gzip, LZO.
-- Support Nested Structure.
IF NOT EXISTS (SELECT * FROM sys.external_file_formats WHERE name = 'ff_parquet')
CREATE EXTERNAL FILE FORMAT [ff_parquet]
WITH ( FORMAT_TYPE = PARQUET)
GO
-- Create the connection to the storage account.
-- Naming convention: ds_[storage account name]_[container]
IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = 'ds_stdemodatalake001_demo')
CREATE EXTERNAL DATA SOURCE [ds_stdemodatalake001_demo]
WITH (
LOCATION = 'abfss://[email protected]'
)
GO
-- A quick look at the data
-- Result: geolocation data is null
SELECT TOP 100 *
FROM
OPENROWSET(
BULK '/nycTripYellow2019Jan/nycTripYellow2019Jan.snappy.parquet',
DATA_SOURCE = 'ds_stdemodatalake001_demo',
FORMAT = 'PARQUET'
) AS [result]
GO
CREATE EXTERNAL TABLE demoData.nycYellow2019Jan (
[vendorID] nvarchar(4000),
[tpepPickupDateTime] datetime2(7),
[tpepDropoffDateTime] datetime2(7),
[passengerCount] int,
[tripDistance] float,
[puLocationId] nvarchar(4000),
[doLocationId] nvarchar(4000),
[startLon] float,
[startLat] float,
[endLon] float,
[endLat] float,
[rateCodeId] int,
[storeAndFwdFlag] nvarchar(4000),
[paymentType] nvarchar(4000),
[fareAmount] float,
[extra] float,
[mtaTax] float,
[improvementSurcharge] nvarchar(4000),
[tipAmount] float,
[tollsAmount] float,
[totalAmount] float,
[puYear] int,
[puMonth] int
)
WITH (
LOCATION = '/nycTripYellow2019Jan/nycTripYellow2019Jan.snappy.parquet',
DATA_SOURCE = [ds_stdemodatalake001_demo],
FILE_FORMAT = [ff_parquet]
)
GO
-- Note: CETAS (Create External Table As Select) creates a copy of the data.
CREATE EXTERNAL TABLE demoData.copy_nycYellow2019Jan
WITH (
LOCATION = '/demoCopyData/',
DATA_SOURCE = [ds_stdemodatalake001_demo],
FILE_FORMAT = [ff_parquet]
)
AS
SELECT *
FROM
OPENROWSET(
BULK '/nycTripYellow2019Jan/nycTripYellow2019Jan.snappy.parquet',
DATA_SOURCE = 'ds_stdemodatalake001_demo',
FORMAT = 'PARQUET'
) AS [result]
GO
SELECT TOP 100 * FROM demoData.nycYellow2019Jan
GO