Database to File
Examples of using Sling to export data from databases to storage systems (files)
We first need to make sure our connections are available in our environment. See Environment, Storage Connections and Database Connections for more details.
export MY_SOURCE_DB='...'
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME | CONN TYPE | SOURCE |
+---------------+------------------+-----------------+
| MY_S3_BUCKET | FileSys - S3 | sling env yaml |
| MY_SOURCE_DB | DB - PostgreSQL | env variable |
| MY_GS_BUCKET | FileSys - Google | sling env yaml |
| MY_AZURE_CONT | FileSys - Azure | sling env yaml |
+---------------+------------------+-----------------+
# using Windows PowerShell
$env:MY_SOURCE_DB = '...'
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME | CONN TYPE | SOURCE |
+---------------+------------------+-----------------+
| MY_S3_BUCKET | FileSys - S3 | sling env yaml |
| MY_SOURCE_DB | DB - PostgreSQL | env variable |
| MY_GS_BUCKET | FileSys - Google | sling env yaml |
| MY_AZURE_CONT | FileSys - Azure | sling env yaml |
+---------------+------------------+-----------------+
Database ⇨ Local Storage (CSV)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file://C:/Temp/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL
defaults:
target_options:
format: csv
streams:
source_schema.source_table:
object: file:///tmp/my_file.csv
source_schema.source_table1:
object: file:///tmp/my_csv_folder/*.csv
source_schema.source_table2:
object: file:///tmp/my_csv_folder/
target_options:
file_max_rows: 100000
source_schema.source_table3:
object: file://C:/Temp/my_csv_folder/ # Windows Path format
target_options:
file_max_rows: 100000
# all tables in schema, except "forbidden_table"
my_schema.*:
object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
target_options:
file_max_rows: 400000 # will split output into multiple files in the folder
my_schema.forbidden_table:
disabled: true
Database ⇨ STDOUT
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --stdout
Database ⇨ Local Storage (JSON)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_json_folder/' --tgt-options '{file_max_bytes: 4000000, format: json}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file://C:/Temp/my_json_folder/' --tgt-options '{file_max_bytes: 4000000, format: json}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL
defaults:
target_options:
format: json
streams:
source_schema.source_table:
object: file:///tmp/my_file.json
source_schema.source_table1:
object: file:///tmp/my_json_folder/*.json
source_schema.source_table2:
object: file:///tmp/my_json_folder/
target_options:
file_max_bytes: 4000000
source_schema.source_table3:
object: file://C:/Temp/my_json_folder/ # Windows Path format
target_options:
file_max_bytes: 4000000
# all tables in schema, except "forbidden_table"
my_schema.*:
object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
target_options:
file_max_rows: 400000 # will split output into multiple files in the folder
my_schema.forbidden_table:
disabled: true
Database ⇨ Local Storage (JSON Lines)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_file.jsonl' --tgt-options '{format: jsonlines}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_json_folder/' --tgt-options '{file_max_bytes: 4000000, format: jsonlines}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file://C:/Temp/my_json_folder/' --tgt-options '{file_max_bytes: 4000000, format: jsonlines}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL
defaults:
target_options:
format: jsonlines
streams:
source_schema.source_table:
object: file:///tmp/my_file.jsonl
source_schema.source_table1:
object: file:///tmp/my_jsonlines_folder/*.jsonl
source_schema.source_table2:
object: file:///tmp/my_jsonlines_folder/
target_options:
file_max_bytes: 4000000
source_schema.source_table3:
object: file://C:/Temp/my_jsonlines_folder/ # Windows Path format
target_options:
file_max_bytes: 4000000
# all tables in schema, except "forbidden_table"
my_schema.*:
object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
target_options:
file_max_rows: 400000 # will split output into multiple files in the folder
my_schema.forbidden_table:
disabled: true
Database ⇨ Local Storage (Parquet)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file:///tmp/my_parquet_folder/' --tgt-options '{file_max_bytes: 4000000, format: parquet}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-object 'file://C:/Temp/my_parquet_folder/' --tgt-options '{file_max_bytes: 4000000, format: parquet}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL
defaults:
target_options:
format: parquet
streams:
source_schema.source_table:
object: file://C:/Temp/my_file.parquet # Windows Path format
source_schema.source_table1:
object: file://C:/Temp/my_parquet_folder/*.parquet # Windows Path format
source_schema.source_table2:
object: file:///tmp/my_parquet_folder/
target_options:
file_max_rows: 1000000
# all tables in schema, except "forbidden_table"
my_schema.*:
object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
target_options:
file_max_rows: 400000 # will split output into multiple files in the folder
my_schema.forbidden_table:
disabled: true
Database ⇨ Cloud Storage (CSV)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE
defaults:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.csv.gz
target_options:
format: csv
compression: gzip
streams:
# all tables in schema
my_schema.*:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.csv
target_options:
file_max_bytes: 400000 # will split output into multiple files in the folder
other_schema.source_table: # will use defaults
Database ⇨ Cloud Storage (JSON)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: json}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: json}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: json}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE
defaults:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.json.gz
target_options:
format: json
compression: gzip
streams:
# all tables in schema
my_schema.*:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.json
target_options:
file_max_bytes: 400000 # will split output into multiple files in the folder
other_schema.source_table: # will use defaults
Database ⇨ Cloud Storage (JSON Lines)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: jsonlines}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: jsonlines}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: jsonlines}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE
defaults:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.jsonl.gz
target_options:
format: jsonlines
compression: gzip
streams:
# all tables in schema, except "forbidden_table"
my_schema.*:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.jsonl
target_options:
file_max_bytes: 400000 # will split output into multiple files in the folder
my_schema.forbidden_table:
disabled: true
other_schema.source_table: # will use defaults
Database ⇨ Cloud Storage (Parquet)
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_parquet_folder/' --tgt-options '{file_max_rows: 100000, format: parquet}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_parquet_folder/' --tgt-options '{file_max_rows: 100000, format: parquet}'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/' --tgt-options '{file_max_rows: 100000, format: parquet}'
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE
defaults:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.parquet
target_options:
format: parquet
streams:
# all tables in schema, except "forbidden_table"
my_schema.*:
object: {stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.parquet
target_options:
file_max_rows: 400000 # will split files into folder
my_schema.forbidden_table:
disabled: true
other_schema.source_table: # will use defaults
Last updated