I have a primary key table on shared storage, and I use Vector to read messages from Kafka and push the data via the HTTP Stream Load API.
However, I noticed that this created a huge number of small files in S3. Also, capacity usage is about 20x the size of the data (as shown by “SHOW DATA”).
The data rate is ~150,000 messages per second, though the messages are quite small (a few KB each).
Buckets are automatic; “SHOW PARTITIONS” reports 12 buckets per partition.
Is that expected? Within 14 days we have accumulated around 50M files and 90TB of data in S3, even though the table itself was well below 10TB.
My table:
-- nginx access-log table.
--   * Primary key: (request_id, timestamp).
--   * One partition per calendar day of `timestamp`.
--   * Rows are hash-distributed on `request_id`; no bucket count is given here.
--   * Sort key: `timestamp`.
CREATE TABLE `nginx_core_log` (
    -- Key columns (both NOT NULL, both part of the primary key).
    `request_id`      varchar(255)   NOT NULL COMMENT "",
    `timestamp`       datetime       NOT NULL COMMENT "",
    -- Value columns extracted from each log record.
    `msec`            double         NULL COMMENT "",
    `host`            varchar(255)   NULL COMMENT "",
    `http_referer`    varchar(65533) NULL COMMENT "",
    `http_user_agent` varchar(65533) NULL COMMENT "",
    `nginx_host`      varchar(255)   NULL COMMENT "",
    `request_length`  bigint(20)     NULL COMMENT "",
    `request_time`    double         NULL COMMENT "",
    `scheme`          varchar(255)   NULL COMMENT "",
    `service`         varchar(255)   NULL COMMENT "",
    `status`          int(11)        NULL COMMENT "",
    -- JSON column; presumably holds the complete original record -- confirm against the loader.
    `raw`             json           NULL COMMENT ""
)
ENGINE=OLAP
PRIMARY KEY(`request_id`, `timestamp`)
COMMENT "OLAP"
PARTITION BY date_trunc('day', timestamp)
DISTRIBUTED BY HASH(`request_id`)
ORDER BY(`timestamp`)
PROPERTIES (
    "compression"             = "ZSTD",
    "datacache.enable"        = "true",
    "enable_async_write_back" = "false",
    "enable_persistent_index" = "true",
    -- NOTE(review): presumably keeps only the 14 most recent partitions -- confirm in the StarRocks docs.
    "partition_live_number"   = "14",
    "persistent_index_type"   = "LOCAL",
    "replication_num"         = "1",
    "storage_volume"          = "builtin_storage_volume"
);
Vector sink
# Vector sink that pushes nginx log events into StarRocks through the
# HTTP Stream Load endpoint of table logging.nginx_core_log.
sinks:
  starrocks-api:
    type: "http"
    inputs: ["nginx_core_log"]
    uri: "sr-backend:8040/api/logging/nginx_core_log/_stream_load"
    method: put
    auth:
      strategy: basic
      user: root
      password: "<password>"
    request:
      # Stream Load options are passed as HTTP headers.
      headers:
        format: json
        # "$$" is Vector's escape for a literal "$" in its config files.
        # The final "$$" path maps the whole JSON object into the `raw` column.
        jsonpaths: '["$$.request_id", "$$.timestamp", "$$.msec", "$$.host", "$$.http_referer", "$$.http_user_agent", "$$.nginx_host", "$$.request_length", "$$.request_time", "$$.scheme", "$$.service", "$$.status", "$$"]'
        columns: "request_id, timestamp, msec, host, http_referer, http_user_agent, nginx_host, request_length, request_time, scheme, service, status, raw"
        ignore_json_size: "true"
        expect: "100-continue"
    batch:
      # Every flushed batch is sent as one HTTP request, i.e. one Stream Load
      # transaction. A 1000-event cap turns a high-rate stream into a very
      # large number of tiny loads, so the limits are raised to make each
      # request carry tens of MB. A batch is flushed when ANY limit is hit.
      # NOTE(review): each in-flight request holds a full batch in memory --
      # size these against the memory available to Vector.
      max_events: 100000
      max_bytes: 67108864  # 64 MiB
      timeout_secs: 15
    buffer:
      type: disk
      max_size: 268435488
    framing:
      method: "newline_delimited"
    encoding:
      codec: "json"
mysql> show partitions from logging.nginx_core_log \G
*************************** 1. row ***************************
PartitionId: 176059
PartitionName: p20241101
CompactVersion: 5375
VisibleVersion: 5375
NextVersion: 5376
State: NORMAL
PartitionKey: timestamp
Range: [types: [DATETIME]; keys: [2024-11-01 00:00:00]; ..types: [DATETIME]; keys: [2024-11-02 00:00:00]; )
DistributionKey: request_id
Buckets: 12
DataSize: 2TB
RowCount: 126538858
EnableDataCache: true
AsyncWrite: false
AvgCS: 7.00
P50CS: 7.00
MaxCS: 7.00
DataVersion: 4706
VersionEpoch: 319994016382844928
VersionTxnType: TXN_NORMAL
*************************** 2. row ***************************
PartitionId: 199206
PartitionName: p20241102
CompactVersion: 2244
VisibleVersion: 2244
NextVersion: 2245
State: NORMAL
PartitionKey: timestamp
Range: [types: [DATETIME]; keys: [2024-11-02 00:00:00]; ..types: [DATETIME]; keys: [2024-11-03 00:00:00]; )
DistributionKey: request_id
Buckets: 12
DataSize: 231.2GB
RowCount: 126538858
EnableDataCache: true
AsyncWrite: false
AvgCS: 7.17
P50CS: 8.00
MaxCS: 9.00
DataVersion: 1882
VersionEpoch: 320237876795146240
VersionTxnType: TXN_NORMAL
*************************** 3. row ***************************
PartitionId: 234470
PartitionName: p20241104
CompactVersion: 61
VisibleVersion: 61
NextVersion: 62
State: NORMAL
PartitionKey: timestamp
Range: [types: [DATETIME]; keys: [2024-11-04 00:00:00]; ..types: [DATETIME]; keys: [2024-11-05 00:00:00]; )
DistributionKey: request_id
Buckets: 12
DataSize: 64GB
RowCount: 126538858
EnableDataCache: true
AsyncWrite: false
AvgCS: 4.00
P50CS: 4.00
MaxCS: 4.00
DataVersion: 56
VersionEpoch: 320616720750346240
VersionTxnType: TXN_NORMAL
*************************** 4. row ***************************
PartitionId: 267434
PartitionName: p20241106
CompactVersion: 122463
VisibleVersion: 129647
NextVersion: 129773
State: NORMAL
PartitionKey: timestamp
Range: [types: [DATETIME]; keys: [2024-11-06 00:00:00]; ..types: [DATETIME]; keys: [2024-11-07 00:00:00]; )
DistributionKey: request_id
Buckets: 12
DataSize: 55.7GB
RowCount: 126538858
EnableDataCache: true
AsyncWrite: false
AvgCS: 68006.67
P50CS: 68007.00
MaxCS: 68029.00
DataVersion: 129622
VersionEpoch: 320996485351079936
VersionTxnType: TXN_NORMAL
4 rows in set (0.04 sec)