File to Database
Examples of using Sling to load data from storage systems into databases
We first need to make sure our connections are available in our environment. See Environment, Storage Connections and Database Connections for more details.
export MY_TARGET_DB='...'
export SAMPLE_SIZE=2000 # increase the sample size to infer types. Default is 900.
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME     | CONN TYPE        | SOURCE          |
+---------------+------------------+-----------------+
| MY_S3_BUCKET  | FileSys - S3     | sling env yaml  |
| MY_TARGET_DB  | DB - PostgreSQL  | env variable    |
| MY_GS_BUCKET  | FileSys - Google | sling env yaml  |
| MY_AZURE_CONT | FileSys - Azure  | sling env yaml  |
+---------------+------------------+-----------------+
# using Windows PowerShell
$env:MY_TARGET_DB = '...'
$env:SAMPLE_SIZE = 2000 # increase the sample size to infer types. Default is 900.
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME     | CONN TYPE        | SOURCE          |
+---------------+------------------+-----------------+
| MY_S3_BUCKET  | FileSys - S3     | sling env yaml  |
| MY_TARGET_DB  | DB - PostgreSQL  | env variable    |
| MY_GS_BUCKET  | FileSys - Google | sling env yaml  |
| MY_AZURE_CONT | FileSys - Azure  | sling env yaml  |
+---------------+------------------+-----------------+
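The connections marked sling env yaml live in the ~/.sling/env.yaml file, while the ones marked env variable are plain environment variables holding a connection URL. Below is a rough sketch of what that could look like; the credential keys, bucket name and URL are placeholders, and the exact properties accepted depend on each connection type (see the connection docs):

# ~/.sling/env.yaml (illustrative sketch, placeholder values)
connections:
  MY_S3_BUCKET:
    type: s3
    bucket: my-bucket
    access_key_id: '<access key>'
    secret_access_key: '<secret key>'

  MY_TARGET_DB:
    type: postgres
    url: 'postgresql://user:password@host:5432/database?sslmode=require'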
Local Storage (CSV) ⇨ Database
Using CLI Flags
$ cat /tmp/my_file.csv | sling run --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_file.csv' --src-options '{columns: { "*": string }}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_csv_folder/' --src-options '{columns: {col2: string, col3: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_csv_folder/' --src-options '{transforms: [remove_accents]}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file://C:/Temp/my_csv_folder/' --src-options '{transforms: [remove_accents]}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: LOCAL
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'

streams:
  # a stream with many parts; all sub-files will be merged into one table
  "file:///tmp/my_csv_folder/":
    source_options:
      format: csv
      transforms: [remove_accents] # Apply transforms. Here we are removing diacritics (accents) from string values.
      columns:
        col2: string # cast `col2` as string

  # expand all files into individual streams; each file will load into its own table
  "file:///tmp/my_csv_folder/*.csv":
    object: 'target_schema.{stream_file_name}'

  # consider as a single stream (don't expand into individual streams)
  "file:///tmp/my_csv_folder/prefix_*.csv":
    object: 'target_schema.my_new_table'
    single: true

  "file:///tmp/my_file.csv":
    source_options:
      columns:
        "*": string # cast all columns to string

  # Windows path format
  "file://C:/Temp/my_file.csv":
    source_options:
      columns:
        "*": string # cast all columns to string

env:
  SAMPLE_SIZE: 2000 # increase the sample size to infer types (default=900).
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Local Storage (Excel) ⇨ Database
Using CLI Flags
$ sling run --src-stream 'file:///path/to/test.excel.xlsx' --src-options '{ sheet: "Sheet2!A:F" }' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: LOCAL
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'

streams:
  # expand all files into individual streams; each file will load into its own table
  "file:///tmp/my_excel_folder/*.xlsx":
    object: 'target_schema.{stream_file_name}'
    source_options:
      sheet: "Sheet1!A:F"

  # consider as a single stream (don't expand into individual streams)
  "file:///tmp/my_excel_folder/prefix_*.xlsx":
    object: 'target_schema.my_new_table'
    single: true
    source_options:
      sheet: "Sheet1!A:F"

  "file:///path/to/test.excel.xlsx":
    source_options:
      sheet: "Sheet2!A:F"
      columns:
        "*": string # cast all columns to string

  # Windows path format
  "file://C:/Temp/my_file.xlsx":
    source_options:
      sheet: "Sheet2!A:F"
      columns:
        "col2": integer # cast col2 to integer

env:
  SAMPLE_SIZE: 2000 # increase the sample size to infer types (default=900).
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
  SLING_ROW_NUM_COLUMN: true # adds a _sling_row_num column with the row number
Local Storage (JSON) ⇨ Database
Using CLI Flags
$ cat /tmp/my_file.json | sling run --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
# Windows path format
$ sling run --src-stream 'file://C:/Temp/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: LOCAL
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.target_table'
  source_options:
    format: json

streams:
  "file:///tmp/my_json_folder/":

  # expand all files into individual streams; each file will load into its own table
  "file:///tmp/my_json_folder/*.json":
    object: 'target_schema.{stream_file_name}'

  # consider as a single stream (don't expand into individual streams)
  "file:///tmp/my_json_folder/prefix_*.json":
    object: 'target_schema.my_new_table'
    single: true

  "file:///tmp/my_file.json":

  # Windows path format
  "file://C:/Temp/my_file.json":
Local Storage (JSON Flattened) ⇨ Database
Using CLI Flags
$ cat /tmp/my_file.json | sling run --src-options '{flatten: true}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_file.json' --src-options '{flatten: true}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_json_folder/' --src-options '{flatten: true}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
# Windows path format
$ sling run --src-stream 'file://C:/Temp/my_json_folder/' --src-options '{flatten: true}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
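For context, flatten: true promotes nested JSON keys to their own columns instead of keeping the nested structure. As a purely hypothetical illustration (the file contents and field names are made up, and the exact column naming for nested keys is determined by Sling):

# /tmp/my_file.json (one record)
{"id": 1, "user": {"name": "Alice", "email": "alice@example.com"}}

With flattening enabled, this row would load as separate columns for id and the two user fields rather than as a single nested object.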
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: LOCAL
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.target_table'
  source_options:
    format: json
    flatten: true

streams:
  "file:///tmp/my_json_folder/":

  # expand all files into individual streams; each file will load into its own table
  "file:///tmp/my_json_folder/*.json":
    object: 'target_schema.{stream_file_name}'

  # consider as a single stream (don't expand into individual streams)
  "file:///tmp/my_json_folder/prefix_*.json":
    object: 'target_schema.my_new_table'
    single: true

  "file:///tmp/my_file.json":

  # Windows path format
  "file://C:/Temp/my_file.json":

env:
  SAMPLE_SIZE: 2000 # increase the sample size to infer types (default=900).
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Local Storage (Parquet) ⇨ Database
Using CLI Flags
$ sling run --src-stream 'file:///tmp/my_file.parquet' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_parquet_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
# Windows path format
$ sling run --src-stream 'file://C:/Temp/my_parquet_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: LOCAL
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.target_table'
  source_options:
    format: parquet

streams:
  "file:///tmp/my_parquet_folder/":

  # expand all files into individual streams; each file will load into its own table
  "file:///tmp/my_parquet_folder/*.parquet":
    object: 'target_schema.{stream_file_name}'

  # consider as a single stream (don't expand into individual streams)
  "file:///tmp/my_parquet_folder/prefix_*.parquet":
    object: 'target_schema.my_new_table'
    single: true

  "file:///tmp/my_file.parquet":

  # Windows path format
  "file://C:/Temp/my_file.parquet":
Local Storage (SAS7BDAT) ⇨ Database
Using CLI Flags
$ sling run --src-stream 'file:///tmp/my_file.sas7bdat' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file:///tmp/my_sas7bdat_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
# Windows path format
$ sling run --src-stream 'file://C:/tmp/my_file.sas7bdat' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-stream 'file://C:/Temp/my_sas7bdat_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: LOCAL
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.target_table'
  source_options:
    format: sas7bdat

streams:
  "file:///tmp/my_sas7bdat_folder/":
  "file:///tmp/my_file.sas7bdat":

  # Windows path format
  "file://C:/Temp/my_file.sas7bdat":
SFTP Storage (CSV) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_SFTP --src-stream '/path/to/my_file.csv' --src-options '{columns: { "*": string }}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_SFTP --src-stream '/path/to/my_csv_folder/' --src-options '{columns: {col2: string, col3: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_SFTP --src-stream '/path/to/my_csv_folder/' --src-options '{transforms: [remove_accents]}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
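MY_SFTP above is assumed to be a storage connection pointing at your SFTP server. For illustration, it could be provided as an environment variable in URL form (placeholder host and credentials below; key-based authentication and other properties are covered in Storage Connections):

export MY_SFTP='sftp://user:password@sftp.example.com:22'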
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SFTP
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'

streams:
  "/path/to/my_csv_folder/":
    source_options:
      format: csv
      transforms: [remove_accents] # Apply transforms. Here we are removing diacritics (accents) from string values.
      columns:
        col2: string # cast `col2` as string

  # expand all files into individual streams; each file will load into its own table
  "/path/to/my_csv_folder/*.csv":

  # consider as a single stream (don't expand into individual streams)
  "/path/to/my_csv_folder/prefix_*.csv":
    object: 'target_schema.my_new_table'
    single: true

  "/path/to/my_file.csv":
    source_options:
      columns:
        "*": string # cast all columns to string

env:
  SAMPLE_SIZE: 2000 # increase the sample size to infer types (default=900).
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Cloud Storage (CSV) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_csv_folder/' --src-options '{columns: {col2: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_csv_folder/' --src-options '{columns: {col2: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/' --src-options '{columns: {col2: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_file.csv' --src-options '{columns: {col2: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_file.csv' --src-options '{columns: {col2: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_file.csv' --src-options '{columns: {col2: string}}' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_FILE_CONN
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'
  source_options:
    format: csv
    columns:
      '*': string # cast all columns as string

streams:
  # no need to specify scheme://bucket
  "my_file.csv":
  "my_csv_folder/": # single stream for whole folder
  "my_csv_folder/*.csv": # individual streams for each file
  "my_csv_folder/prefix_*.csv": # single stream for all files
    single: true

  "s3://my-bucket/my_csv_folder/":
  "s3://my-bucket/my_csv_folder/*.csv":
  "s3://my-bucket/my_csv_folder/prefix_*.csv":
    single: true
  "s3://my-bucket/my_file.csv":

  "gs://my-bucket/my_csv_folder/":
  "gs://my-bucket/my_csv_folder/*.csv":
  "gs://my-bucket/my_csv_folder/prefix_*.csv":
    single: true
  "gs://my-bucket/my_file.csv":

env:
  SAMPLE_SIZE: 2000 # increase the sample size to infer types (default=900).
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Cloud Storage (JSON) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_FILE_STORAGE
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'
  source_options:
    format: json
    columns:
      '*': string # cast all columns as string

streams:
  # no need to specify scheme://bucket
  "my_file.json":
  "my_json_folder/": # single stream for whole folder
  "my_json_folder/*.json": # individual streams for each file
  "my_json_folder/prefix_*.json": # single stream for all files
    single: true

  "s3://my-bucket/my_json_folder/":
  "s3://my-bucket/my_json_folder/*.json":
  "s3://my-bucket/my_json_folder/prefix_*.json":
    single: true
  "s3://my-bucket/my_file.json":

  "gs://my-bucket/my_json_folder/":
  "gs://my-bucket/my_json_folder/*.json":
  "gs://my-bucket/my_json_folder/prefix_*.json":
    single: true
  "gs://my-bucket/my_file.json":
Cloud Storage (JSON Flattened) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_S3_BUCKET --src-options '{flatten: true}' --src-stream 's3://my-bucket/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-options '{flatten: true}' --src-stream 'gs://my-bucket/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-options '{flatten: true}' --src-stream 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_S3_BUCKET --src-options '{flatten: true}' --src-stream 's3://my-bucket/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-options '{flatten: true}' --src-stream 'gs://my-bucket/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-options '{flatten: true}' --src-stream 'https://my_account.blob.core.windows.net/my-container/my_file.json' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_FILE_STORAGE
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'
  source_options:
    format: json
    flatten: true
    columns:
      '*': string # cast all columns as string

streams:
  # no need to specify scheme://bucket
  "my_json_folder/": # single stream for whole folder
  "my_json_folder/*.json": # individual streams for each file
  "my_json_folder/prefix_*.json": # single stream for all files
    single: true

  "s3://my-bucket/my_json_folder/":
  "s3://my-bucket/my_json_folder/*.json":
  "s3://my-bucket/my_json_folder/prefix_*.json":
    single: true
  "s3://my-bucket/my_file.json":

  "gs://my-bucket/my_json_folder/":
  "gs://my-bucket/my_json_folder/*.json":
  "gs://my-bucket/my_json_folder/prefix_*.json":
    single: true
  "gs://my-bucket/my_file.json":

env:
  SAMPLE_SIZE: 2000 # increase the sample size to infer types (default=900).
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Cloud Storage (Parquet) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_parquet_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_parquet_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_file.parquet' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_file.parquet' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_file.parquet' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_FILE_STORAGE
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'
  source_options:
    format: parquet

streams:
  # no need to specify scheme://bucket
  "my_file.parquet":
  "my_parquet_folder/": # single stream for whole folder
  "my_parquet_folder/*.parquet": # one stream for each file
  "my_parquet_folder/prefix_*.parquet": # single stream for all files
    single: true

  "s3://my-bucket/my_parquet_folder/":
  "s3://my-bucket/my_parquet_folder/*.parquet":
  "s3://my-bucket/my_parquet_folder/prefix_*.parquet":
    single: true
  "s3://my-bucket/my_file.parquet":

  "gs://my-bucket/my_parquet_folder/":
  "gs://my-bucket/my_parquet_folder/*.parquet":
  "gs://my-bucket/my_parquet_folder/prefix_*.parquet":
    single: true
  "gs://my-bucket/my_file.parquet":

env:
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Cloud Storage (Avro) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_avro_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_avro_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_avro_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_file.avro' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_file.avro' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_file.avro' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_FILE_STORAGE
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'
  source_options:
    format: avro

streams:
  # no need to specify scheme://bucket
  "my_file.avro":
  "my_avro_folder/": # single stream for whole folder
  "my_avro_folder/*.avro": # one stream for each file
  "my_avro_folder/prefix_*.avro": # single stream for all files
    single: true

  "s3://my-bucket/my_avro_folder/":
  "s3://my-bucket/my_avro_folder/*.avro":
  "s3://my-bucket/my_avro_folder/prefix_*.avro":
    single: true
  "s3://my-bucket/my_file.avro":

  "gs://my-bucket/my_avro_folder/":
  "gs://my-bucket/my_avro_folder/*.avro":
  "gs://my-bucket/my_avro_folder/prefix_*.avro":
    single: true
  "gs://my-bucket/my_file.avro":

env:
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path
Cloud Storage (XML) ⇨ Database
Using CLI Flags
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_xml_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_xml_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_xml_folder/' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_S3_BUCKET --src-stream 's3://my-bucket/my_file.xml' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_GS_BUCKET --src-stream 'gs://my-bucket/my_file.xml' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
$ sling run --src-conn MY_AZURE_CONT --src-stream 'https://my_account.blob.core.windows.net/my-container/my_file.xml' --tgt-conn MY_TARGET_DB --tgt-object 'target_schema.target_table' --mode full-refresh
Using Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_FILE_STORAGE
target: MY_TARGET_DB

defaults:
  mode: full-refresh
  object: 'target_schema.{stream_file_folder}_{stream_file_name}'
  source_options:
    format: xml

streams:
  # no need to specify scheme://bucket
  "my_xml_folder/":
  "my_file.xml":

  "s3://my-bucket/my_xml_folder/":
  "s3://my-bucket/my_file.xml":

  "gs://my-bucket/my_xml_folder/":
  "gs://my-bucket/my_file.xml":

env:
  SLING_STREAM_URL_COLUMN: true # adds a _sling_stream_url column with file path