Database to File
Examples of using Sling to load data from databases to storage systems
We first need to make sure our connections are available in our environment. See the Environment and Connections documentation for more details.
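Environment variables work well for ad-hoc runs; for something more permanent, connections can also be declared in Sling's env file. Below is a minimal sketch of what a ~/.sling/env.yaml could look like, using the connection names from this page — all hosts, keys and credentials are placeholders:

# ~/.sling/env.yaml — placeholder values, adjust to your setup
connections:
  MY_SOURCE_DB:
    type: postgres
    host: localhost
    port: 5432
    database: mydb
    user: myuser
    password: mypass

  MY_S3_BUCKET:
    type: s3
    bucket: my-bucket
    access_key_id: <ACCESS_KEY>
    secret_access_key: <SECRET_KEY>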
# using bash / zsh
export MY_SOURCE_DB='...'
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME | CONN TYPE | SOURCE |
+---------------+------------------+-----------------+
| MY_S3_BUCKET | FileSys - S3 | sling env yaml |
| MY_SOURCE_DB | DB - PostgreSQL | env variable |
| MY_GS_BUCKET | FileSys - Google | sling env yaml |
| MY_AZURE_CONT | FileSys - Azure | sling env yaml |
+---------------+------------------+-----------------+
# using Windows PowerShell
$env:MY_SOURCE_DB = '...'
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME | CONN TYPE | SOURCE |
+---------------+------------------+-----------------+
| MY_S3_BUCKET | FileSys - S3 | sling env yaml |
| MY_SOURCE_DB | DB - PostgreSQL | env variable |
| MY_GS_BUCKET | FileSys - Google | sling env yaml |
| MY_AZURE_CONT | FileSys - Azure | sling env yaml |
+---------------+------------------+-----------------+
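Before running any task, you can also verify that a connection actually works:

$ sling conns test MY_SOURCE_DB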
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_csv_folder/' \
--tgt-options '{file_max_rows: 100000, format: csv}'
# Windows path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_csv_folder/' \
--tgt-options '{file_max_rows: 100000, format: csv}'
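After a folder export with file_max_rows completes, a quick way to sanity-check the split output (paths from the example above):

$ ls -lh /tmp/my_csv_folder/
$ wc -l /tmp/my_csv_folder/*.csv   # each part should hold at most 100,000 data rows (plus a header line)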
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: csv

streams:
  source_schema.source_table:
    object: file:///tmp/my_file.csv

  source_schema.source_table1:
    object: file:///tmp/my_csv_folder/*.csv

  source_schema.source_table2:
    object: file:///tmp/my_csv_folder/
    target_options:
      file_max_rows: 100000

  source_schema.source_table3:
    object: file://C:/Temp/my_csv_folder/ # Windows path format
    target_options:
      file_max_rows: 100000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
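To run only a subset of the streams defined in a replication, the CLI accepts a --streams filter (check sling run --help for your version); a sketch using the stream names above:

$ sling run -r /path/to/replication.yaml --streams 'source_schema.source_table,my_schema.*'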
Using CLI Flags
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --stdout
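Since --stdout writes the records to standard output (CSV by default), the export can be piped straight into other tools, for example compressing on the fly:

$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --stdout | gzip -c > /tmp/my_file.csv.gz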
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.json'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: json}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: json}'
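To confirm the JSON export parses cleanly, jq (if installed) can act as a validator:

$ jq empty /tmp/my_file.json && echo 'valid JSON'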
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: json

streams:
  source_schema.source_table:
    object: file:///tmp/my_file.json

  source_schema.source_table1:
    object: file:///tmp/my_json_folder/*.json

  source_schema.source_table2:
    object: file:///tmp/my_json_folder/
    target_options:
      file_max_bytes: 4000000

  source_schema.source_table3:
    object: file://C:/Temp/my_json_folder/ # Windows path format
    target_options:
      file_max_bytes: 4000000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.jsonl' \
--tgt-options '{format: jsonlines}'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: jsonlines}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: jsonlines}'
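Because JSON Lines keeps one record per line, ordinary line tools compose well with the output; a sketch assuming jq is installed:

$ head -n 3 /tmp/my_file.jsonl | jq .   # pretty-print the first three records
$ wc -l < /tmp/my_file.jsonl            # record count equals line count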
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: jsonlines

streams:
  source_schema.source_table:
    object: file:///tmp/my_file.jsonl

  source_schema.source_table1:
    object: file:///tmp/my_jsonlines_folder/*.jsonl

  source_schema.source_table2:
    object: file:///tmp/my_jsonlines_folder/
    target_options:
      file_max_bytes: 4000000

  source_schema.source_table3:
    object: file://C:/Temp/my_jsonlines_folder/ # Windows path format
    target_options:
      file_max_bytes: 4000000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_parquet_folder/' \
--tgt-options '{file_max_rows: 4000000, format: parquet}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_parquet_folder/' \
--tgt-options '{file_max_rows: 4000000, format: parquet}'
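Parquet isn't human-readable, so a quick way to inspect the export is the DuckDB CLI, if you have it available (path from the example above):

$ duckdb -c "SELECT COUNT(*) FROM '/tmp/my_file.parquet'"
$ duckdb -c "DESCRIBE SELECT * FROM '/tmp/my_file.parquet'"   # column names and types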
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: parquet

streams:
  source_schema.source_table:
    object: file://C:/Temp/my_file.parquet # Windows path format

  source_schema.source_table1:
    object: file://C:/Temp/my_parquet_folder/*.parquet # Windows path format

  source_schema.source_table2:
    object: file:///tmp/my_parquet_folder/
    target_options:
      file_max_rows: 1000000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/' --tgt-options '{file_max_rows: 100000, format: csv}'
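To verify what landed in the bucket, the AWS CLI (assuming it is configured for the same account) can list the objects:

$ aws s3 ls s3://my-bucket/my_csv_folder/ --human-readable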
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.csv.gz'
  target_options:
    format: csv
    compression: gzip

streams:
  # all tables in schema
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.csv'
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  other_schema.source_table: # will use the defaults above

env:
  SLING_THREADS: 3 # run streams concurrently
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: json}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: json}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: json}'
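The equivalent check on Google Cloud Storage, assuming gsutil is set up for the same project:

$ gsutil ls -l gs://my-bucket/my_json_folder/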
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.json.gz'
  target_options:
    format: json
    compression: gzip

streams:
  # all tables in schema
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.json'
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  other_schema.source_table: # will use the defaults above
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: jsonlines}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: jsonlines}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' --tgt-options '{file_max_rows: 100000, format: jsonlines}'
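And for Azure Blob Storage, assuming the Azure CLI is logged in to the same account:

$ az storage blob list --account-name my_account --container-name my-container --prefix my_json_folder/ --output table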
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.jsonl.gz'
  target_options:
    format: jsonlines
    compression: gzip

streams:
  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.jsonl'
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  my_schema.forbidden_table:
    disabled: true

  other_schema.source_table: # will use the defaults above

env:
  SLING_THREADS: 3 # run streams concurrently
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_parquet_folder/' --tgt-options '{file_max_rows: 100000, format: parquet}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_parquet_folder/' --tgt-options '{file_max_rows: 100000, format: parquet}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/' --tgt-options '{file_max_rows: 100000, format: parquet}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.parquet'
  target_options:
    format: parquet

streams:
  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.parquet'
    target_options:
      file_max_rows: 400000 # splits output into multiple files in the folder

  my_schema.forbidden_table:
    disabled: true

  other_schema.source_table: # will use the defaults above

env:
  SLING_THREADS: 3 # run streams concurrently