技術 · 2025年11月23日
Loki 相關筆記
Loki 部署筆記 [未整理]
Loki
- storageclass
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: ebs-sc-log
provisioner: ebs.csi.eks.amazonaws.com
parameters:
type: gp3
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: Immediate
allowedTopologies:
- matchLabelExpressions:
- key: topology.kubernetes.io/zone
values:
- us-east-1a
- us-east-1c
- us-east-1d
- values.yaml
deploymentMode: Distributed
# configuration for Loki with S3 storage
serviceAccount:
annotations:
# 要先建置service role
# policy 要incude s3 bueckt 存取權限
"eks.amazonaws.com/role-arn": "arn:aws:iam::354950674167:role/bit-au-prod-nv-loki-service-role"
# Main Loki configuration
loki:
# Schema configuration for TSDB with S3
schemaConfig:
configs:
- from: 2024-04-01
store: tsdb
object_store: s3
schema: v13
index:
prefix: bit_prod_au_hk_loki_index_
period: 24h
# Server configuration
server:
http_listen_port: 3100
log_level: info
log_format: json
http_server_read_timeout: 1m
http_server_write_timeout: 1m
# Authentication disabled
auth_enabled: false
# Common configuration (this will be merged with storage config automatically)
commonConfig:
path_prefix: /var/loki
replication_factor: 3
limits_config:
# 查詢性能優化
split_queries_by_interval: 5m
query_timeout: 5m
max_query_parallelism: 64
max_cache_freshness_per_query: 1m
max_query_series: 5000
max_query_length: 24h
# 保持你的其他現有配置不變
reject_old_samples: true
reject_old_samples_max_age: 168h
creation_grace_period: 10m
ingestion_rate_mb: 50
ingestion_burst_size_mb: 100
max_line_size: 1MB
max_line_size_truncate: true
max_label_name_length: 1024
max_label_value_length: 4096
max_label_names_per_series: 30
max_streams_per_user: 100000
max_global_streams_per_user: 100000
per_stream_rate_limit: 10MB
per_stream_rate_limit_burst: 20MB
volume_enabled: true
unordered_writes: true
use_owned_stream_count: false
# Ingester configuration
ingester:
chunk_encoding: snappy
chunk_target_size: 1048576 # 1MB
chunk_idle_period: 30m
max_chunk_age: 2h
chunk_retain_period: 1m
wal:
dir: /var/loki/wal
lifecycler:
ring:
kvstore:
store: memberlist
replication_factor: 3
heartbeat_timeout: 1m
zone_awareness_enabled: false
# Distributor configuration
distributor:
ring:
kvstore:
store: memberlist
# Querier configuration
querier:
max_concurrent: 32
tail_max_duration: 1h
# Storage configuration
# s3 這定檔在這
storage:
type: s3
bucketNames:
chunks: "bit-prod-au-hk-loki-data"
ruler: "bit-prod-au-hk-loki-data"
admin: "bit-prod-au-hk-loki-data"
s3:
region: ap-east-1
s3ForcePathStyle: false
insecure: false
http_config:
idle_conn_timeout: 90s
response_header_timeout: 0s
insecure_skip_verify: false
# Storage config for TSDB
storage_config:
tsdb_shipper:
active_index_directory: /var/loki/tsdb-index
cache_location: /var/loki/tsdb-cache
cache_ttl: 24h
# Compactor configuration
compactor:
working_directory: /tmp/loki/compactor
compaction_interval: 10m
retention_enabled: true # 如果不需要 retention,設為 false
retention_delete_delay: 2h
retention_delete_worker_count: 150
delete_request_store: s3 # 只有當 retention_enabled=true 時才需要
# Frontend configuration
frontend:
log_queries_longer_than: 5s
compress_responses: true
max_outstanding_per_tenant: 2048
# Ruler configuration (internal)
ruler:
storage:
type: s3
rule_path: /var/loki/rules-temp
wal:
dir: /var/loki/ruler-wal
ring:
kvstore:
store: memberlist
enable_api: true
# Tracing
tracing:
enabled: true
# Index gateway configuration
index_gateway:
mode: simple
# Memberlist configuration
memberlist:
node_name: ""
randomize_node_name: true
stream_timeout: 10s
retransmit_mult: 4
push_pull_interval: 30s
gossip_interval: 200ms
gossip_nodes: 3
gossip_to_dead_time: 30s
dead_node_reclaim_time: 0s
compression_enabled: true
# Component-specific configurations
ingester:
replicas: 6
resources:
requests:
cpu: 2
memory: 2560Mi
limits:
cpu: 2
memory: 2560Mi
podDisruptionBudget:
maxUnavailable: 1
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
querier:
replicas: 6
maxUnavailable: 1
resources:
requests:
cpu: 3 # 從 2 增加到 3
memory: 6Gi # 從 4Gi 增加到 6Gi
limits:
cpu: 3
memory: 6Gi
podDisruptionBudget:
maxUnavailable: 1
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
queryFrontend:
replicas: 6
maxUnavailable: 1
resources:
requests:
cpu: 3 # 從 2 增加到 3
memory: 6Gi # 從 4Gi 增加到 6Gi
limits:
cpu: 3
memory: 6Gi
podDisruptionBudget:
maxUnavailable: 1
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
queryScheduler:
replicas: 6
resources:
requests:
cpu: 3
memory: 6Gi
limits:
cpu: 3
memory: 6Gi
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
distributor:
replicas: 3
maxUnavailable: 1
resources:
requests:
cpu: 2
memory: 2Gi
limits:
cpu: 2
memory: 2Gi
podDisruptionBudget:
maxUnavailable: 1
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
compactor:
replicas: 3
resources:
requests:
cpu: 2
memory: 2Gi
limits:
cpu: 2
memory: 2Gi
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
persistence:
size: 20Gi
storageClass: ebs-sc-log
indexGateway:
replicas: 3
maxUnavailable: 1
resources:
requests:
cpu: 2
memory: 2Gi
limits:
cpu: 2
memory: 2Gi
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
podDisruptionBudget:
maxUnavailable: 1
# Cache configurations
# Ruler configuration
ruler:
enabled: true
replicas: 1
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 200m
memory: 512Mi
# Disabled components (using distributed mode)
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
singleBinary:
replicas: 0
resultsCache:
enabled: true
resources:
requests:
memory: "2Gi" # 從 512Mi 增加
cpu: "500m" # 從 200m 增加
limits:
memory: "4Gi" # 從 1Gi 增加
cpu: "1000m" # 從 500m 增加
chunksCache:
enabled: true
replicas: 3
resources:
requests:
cpu: 3 # 從 2 增加
memory: 8Gi # 從 4Gi 增加
limits:
cpu: 3
memory: 8Gi
# Disable minio (using S3)
minio:
enabled: false
# Gateway configuration (optional - for external access)
gateway:
enabled: true
replicas: 6
resources:
requests:
cpu: 2
memory: 2Gi
limits:
cpu: 2
memory: 2Gi
nodeSelector:
karpenter.sh/nodepool: bit-au-ops-node-pool
ingress:
enabled: false # Set to true if you need external access
monitoring:
serviceMonitor:
enabled: true
labels:
app: loki
release: prometheus
interval: 30s
scrapeTimeout: 10s
metricRelabelings: []
relabelings: []
prometheusRule:
enabled: true
groups:
- name: loki-alerts
rules:
- alert: LokiIngestionRateHigh
expr: rate(loki_distributor_ingester_append_failures_total[5m]) > 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "Loki ingestion failure rate is high"
description: "Loki ingestion failure rate is {{ $value }} per second"
- alert: LokiRequestLatencyHigh
expr: histogram_quantile(0.99, rate(loki_request_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Loki request latency is high"
description: "Loki 99th percentile latency is {{ $value }}s"
- alert: LokiIngesterDown
expr: up{job=~"loki/ingester"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Loki ingester is down"
description: "Loki ingester {{ $labels.instance }} has been down for more than 5 minutes"
- alert: LokiDistributorDown
expr: up{job=~"loki/distributor"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Loki distributor is down"
description: "Loki distributor {{ $labels.instance }} has been down for more than 5 minutes"
# Network policies (optional security enhancement)
networkPolicy:
enabled: false # Set to true if you want network isolation
# egress:
# enabled: true
# ports:
# - port: 443
# protocol: TCP
# - port: 53
# protocol: UDP
# ingress:
# enabled: true
# namespaceSelector: {}
# Pod Security Context
securityContext:
runAsNonRoot: true
runAsUser: 10001
runAsGroup: 10001
fsGroup: 10001
# Container Security Context
containerSecurityContext:
readOnlyRootFilesystem: false # 允許寫入 /var/loki
capabilities:
drop:
- ALL
allowPrivilegeEscalation: false
# Resource quotas and limits (optional)
# rbac:
# pspEnabled: false
# Enterprise features (if using Grafana Enterprise Logs)
enterprise:
enabled: false
# Test configuration
test:
enabled: true
prometheusAddress: "http://prometheus-stack-kube-prom-prometheus.monitoring.svc.cluster.local:9090"
# Global settings
global:
# Pod annotations
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3100"
prometheus.io/path: "/metrics"
# Image settings
image:
registry: docker.io
pullPolicy: IfNotPresent
# Priority class (optional)
# priorityClassName: "high-priority"
# Grafana Agent configuration (if using)
grafanaAgent:
enabled: false
Install
helm repo add grafana https://grafana.github.io/helm-charts
helm upgrade --install --values values.yaml loki grafana/loki -n logs --create-namespace
標籤:Kubernetes、Logs、Loki