Add snapshot interval
This commit adds the ability to periodically perform snapshots on an interval that is separate from retention. For example, this lets you retain backups for 24 hours but you can snapshot your database every six hours to improve recovery time.
This commit is contained in:
@@ -243,6 +243,7 @@ type ReplicaConfig struct {
|
|||||||
Retention time.Duration `yaml:"retention"`
|
Retention time.Duration `yaml:"retention"`
|
||||||
RetentionCheckInterval time.Duration `yaml:"retention-check-interval"`
|
RetentionCheckInterval time.Duration `yaml:"retention-check-interval"`
|
||||||
SyncInterval time.Duration `yaml:"sync-interval"` // s3 only
|
SyncInterval time.Duration `yaml:"sync-interval"` // s3 only
|
||||||
|
SnapshotInterval time.Duration `yaml:"snapshot-interval"`
|
||||||
ValidationInterval time.Duration `yaml:"validation-interval"`
|
ValidationInterval time.Duration `yaml:"validation-interval"`
|
||||||
|
|
||||||
// S3 settings
|
// S3 settings
|
||||||
@@ -304,6 +305,9 @@ func newFileReplicaFromConfig(c *ReplicaConfig, db *litestream.DB) (_ *litestrea
|
|||||||
if v := c.RetentionCheckInterval; v > 0 {
|
if v := c.RetentionCheckInterval; v > 0 {
|
||||||
r.RetentionCheckInterval = v
|
r.RetentionCheckInterval = v
|
||||||
}
|
}
|
||||||
|
if v := c.SnapshotInterval; v > 0 {
|
||||||
|
r.SnapshotInterval = v
|
||||||
|
}
|
||||||
if v := c.ValidationInterval; v > 0 {
|
if v := c.ValidationInterval; v > 0 {
|
||||||
r.ValidationInterval = v
|
r.ValidationInterval = v
|
||||||
}
|
}
|
||||||
@@ -372,6 +376,9 @@ func newS3ReplicaFromConfig(c *ReplicaConfig, db *litestream.DB) (_ *s3.Replica,
|
|||||||
if v := c.SyncInterval; v > 0 {
|
if v := c.SyncInterval; v > 0 {
|
||||||
r.SyncInterval = v
|
r.SyncInterval = v
|
||||||
}
|
}
|
||||||
|
if v := c.SnapshotInterval; v > 0 {
|
||||||
|
r.SnapshotInterval = v
|
||||||
|
}
|
||||||
if v := c.ValidationInterval; v > 0 {
|
if v := c.ValidationInterval; v > 0 {
|
||||||
r.ValidationInterval = v
|
r.ValidationInterval = v
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ const (
|
|||||||
|
|
||||||
// Litestream errors.
|
// Litestream errors.
|
||||||
var (
|
var (
|
||||||
|
ErrNoGeneration = errors.New("no generation available")
|
||||||
ErrNoSnapshots = errors.New("no snapshots available")
|
ErrNoSnapshots = errors.New("no snapshots available")
|
||||||
ErrChecksumMismatch = errors.New("invalid replica, checksum mismatch")
|
ErrChecksumMismatch = errors.New("invalid replica, checksum mismatch")
|
||||||
)
|
)
|
||||||
|
|||||||
57
replica.go
57
replica.go
@@ -98,8 +98,11 @@ type FileReplica struct {
|
|||||||
walIndexGauge prometheus.Gauge
|
walIndexGauge prometheus.Gauge
|
||||||
walOffsetGauge prometheus.Gauge
|
walOffsetGauge prometheus.Gauge
|
||||||
|
|
||||||
|
// Frequency to create new snapshots.
|
||||||
|
SnapshotInterval time.Duration
|
||||||
|
|
||||||
// Time to keep snapshots and related WAL files.
|
// Time to keep snapshots and related WAL files.
|
||||||
// Database is snapshotted after interval and older WAL files are discarded.
|
// Database is snapshotted after interval, if needed, and older WAL files are discarded.
|
||||||
Retention time.Duration
|
Retention time.Duration
|
||||||
|
|
||||||
// Time between checks for retention.
|
// Time between checks for retention.
|
||||||
@@ -402,9 +405,10 @@ func (r *FileReplica) Start(ctx context.Context) {
|
|||||||
ctx, r.cancel = context.WithCancel(ctx)
|
ctx, r.cancel = context.WithCancel(ctx)
|
||||||
|
|
||||||
// Start goroutine to replicate data.
|
// Start goroutine to replicate data.
|
||||||
r.wg.Add(3)
|
r.wg.Add(4)
|
||||||
go func() { defer r.wg.Done(); r.monitor(ctx) }()
|
go func() { defer r.wg.Done(); r.monitor(ctx) }()
|
||||||
go func() { defer r.wg.Done(); r.retainer(ctx) }()
|
go func() { defer r.wg.Done(); r.retainer(ctx) }()
|
||||||
|
go func() { defer r.wg.Done(); r.snapshotter(ctx) }()
|
||||||
go func() { defer r.wg.Done(); r.validator(ctx) }()
|
go func() { defer r.wg.Done(); r.validator(ctx) }()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -446,7 +450,18 @@ func (r *FileReplica) monitor(ctx context.Context) {
|
|||||||
|
|
||||||
// retainer runs in a separate goroutine and handles retention.
|
// retainer runs in a separate goroutine and handles retention.
|
||||||
func (r *FileReplica) retainer(ctx context.Context) {
|
func (r *FileReplica) retainer(ctx context.Context) {
|
||||||
ticker := time.NewTicker(r.RetentionCheckInterval)
|
// Disable retention enforcement if retention period is non-positive.
|
||||||
|
if r.Retention <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure check interval is not longer than retention period.
|
||||||
|
checkInterval := r.RetentionCheckInterval
|
||||||
|
if checkInterval > r.Retention {
|
||||||
|
checkInterval = r.Retention
|
||||||
|
}
|
||||||
|
|
||||||
|
ticker := time.NewTicker(checkInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
@@ -462,6 +477,28 @@ func (r *FileReplica) retainer(ctx context.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// snapshotter runs in a separate goroutine and handles snapshotting.
|
||||||
|
func (r *FileReplica) snapshotter(ctx context.Context) {
|
||||||
|
if r.SnapshotInterval <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
ticker := time.NewTicker(r.SnapshotInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
if err := r.Snapshot(ctx); err != nil && err != ErrNoGeneration {
|
||||||
|
log.Printf("%s(%s): snapshotter error: %s", r.db.Path(), r.Name(), err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// validator runs in a separate goroutine and handles periodic validation.
|
// validator runs in a separate goroutine and handles periodic validation.
|
||||||
func (r *FileReplica) validator(ctx context.Context) {
|
func (r *FileReplica) validator(ctx context.Context) {
|
||||||
// Initialize counters since validation occurs infrequently.
|
// Initialize counters since validation occurs infrequently.
|
||||||
@@ -531,6 +568,18 @@ func (r *FileReplica) CalcPos(ctx context.Context, generation string) (pos Pos,
|
|||||||
return pos, nil
|
return pos, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Snapshot copies the entire database to the replica path.
|
||||||
|
func (r *FileReplica) Snapshot(ctx context.Context) error {
|
||||||
|
// Find current position of database.
|
||||||
|
pos, err := r.db.Pos()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("cannot determine current db generation: %w", err)
|
||||||
|
} else if pos.IsZero() {
|
||||||
|
return ErrNoGeneration
|
||||||
|
}
|
||||||
|
return r.snapshot(ctx, pos.Generation, pos.Index)
|
||||||
|
}
|
||||||
|
|
||||||
// snapshot copies the entire database to the replica path.
|
// snapshot copies the entire database to the replica path.
|
||||||
func (r *FileReplica) snapshot(ctx context.Context, generation string, index int) error {
|
func (r *FileReplica) snapshot(ctx context.Context, generation string, index int) error {
|
||||||
// Acquire a read lock on the database during snapshot to prevent checkpoints.
|
// Acquire a read lock on the database during snapshot to prevent checkpoints.
|
||||||
@@ -557,7 +606,7 @@ func (r *FileReplica) snapshot(ctx context.Context, generation string, index int
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Printf("%s(%s): snapshot: creating %s/%08x t=%s", r.db.Path(), r.Name(), generation, index, time.Since(startTime))
|
log.Printf("%s(%s): snapshot: creating %s/%08x t=%s", r.db.Path(), r.Name(), generation, index, time.Since(startTime).Truncate(time.Millisecond))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
55
s3/s3.go
55
s3/s3.go
@@ -80,6 +80,9 @@ type Replica struct {
|
|||||||
// Time between syncs with the shadow WAL.
|
// Time between syncs with the shadow WAL.
|
||||||
SyncInterval time.Duration
|
SyncInterval time.Duration
|
||||||
|
|
||||||
|
// Frequency to create new snapshots.
|
||||||
|
SnapshotInterval time.Duration
|
||||||
|
|
||||||
// Time to keep snapshots and related WAL files.
|
// Time to keep snapshots and related WAL files.
|
||||||
// Database is snapshotted after interval and older WAL files are discarded.
|
// Database is snapshotted after interval and older WAL files are discarded.
|
||||||
Retention time.Duration
|
Retention time.Duration
|
||||||
@@ -427,9 +430,10 @@ func (r *Replica) Start(ctx context.Context) {
|
|||||||
ctx, r.cancel = context.WithCancel(ctx)
|
ctx, r.cancel = context.WithCancel(ctx)
|
||||||
|
|
||||||
// Start goroutines to manage replica data.
|
// Start goroutines to manage replica data.
|
||||||
r.wg.Add(3)
|
r.wg.Add(4)
|
||||||
go func() { defer r.wg.Done(); r.monitor(ctx) }()
|
go func() { defer r.wg.Done(); r.monitor(ctx) }()
|
||||||
go func() { defer r.wg.Done(); r.retainer(ctx) }()
|
go func() { defer r.wg.Done(); r.retainer(ctx) }()
|
||||||
|
go func() { defer r.wg.Done(); r.snapshotter(ctx) }()
|
||||||
go func() { defer r.wg.Done(); r.validator(ctx) }()
|
go func() { defer r.wg.Done(); r.validator(ctx) }()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -479,7 +483,18 @@ func (r *Replica) monitor(ctx context.Context) {
|
|||||||
|
|
||||||
// retainer runs in a separate goroutine and handles retention.
|
// retainer runs in a separate goroutine and handles retention.
|
||||||
func (r *Replica) retainer(ctx context.Context) {
|
func (r *Replica) retainer(ctx context.Context) {
|
||||||
ticker := time.NewTicker(r.RetentionCheckInterval)
|
// Disable retention enforcement if retention period is non-positive.
|
||||||
|
if r.Retention <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure check interval is not longer than retention period.
|
||||||
|
checkInterval := r.RetentionCheckInterval
|
||||||
|
if checkInterval > r.Retention {
|
||||||
|
checkInterval = r.Retention
|
||||||
|
}
|
||||||
|
|
||||||
|
ticker := time.NewTicker(checkInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
@@ -495,6 +510,28 @@ func (r *Replica) retainer(ctx context.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// snapshotter runs in a separate goroutine and handles snapshotting.
|
||||||
|
func (r *Replica) snapshotter(ctx context.Context) {
|
||||||
|
if r.SnapshotInterval <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
ticker := time.NewTicker(r.SnapshotInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
if err := r.Snapshot(ctx); err != nil && err != litestream.ErrNoGeneration {
|
||||||
|
log.Printf("%s(%s): snapshotter error: %s", r.db.Path(), r.Name(), err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// validator runs in a separate goroutine and handles periodic validation.
|
// validator runs in a separate goroutine and handles periodic validation.
|
||||||
func (r *Replica) validator(ctx context.Context) {
|
func (r *Replica) validator(ctx context.Context) {
|
||||||
// Initialize counters since validation occurs infrequently.
|
// Initialize counters since validation occurs infrequently.
|
||||||
@@ -572,6 +609,18 @@ func (r *Replica) CalcPos(ctx context.Context, generation string) (pos litestrea
|
|||||||
return pos, nil
|
return pos, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Snapshot copies the entire database to the replica path.
|
||||||
|
func (r *Replica) Snapshot(ctx context.Context) error {
|
||||||
|
// Find current position of database.
|
||||||
|
pos, err := r.db.Pos()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("cannot determine current db generation: %w", err)
|
||||||
|
} else if pos.IsZero() {
|
||||||
|
return litestream.ErrNoGeneration
|
||||||
|
}
|
||||||
|
return r.snapshot(ctx, pos.Generation, pos.Index)
|
||||||
|
}
|
||||||
|
|
||||||
// snapshot copies the entire database to the replica path.
|
// snapshot copies the entire database to the replica path.
|
||||||
func (r *Replica) snapshot(ctx context.Context, generation string, index int) error {
|
func (r *Replica) snapshot(ctx context.Context, generation string, index int) error {
|
||||||
// Acquire a read lock on the database during snapshot to prevent checkpoints.
|
// Acquire a read lock on the database during snapshot to prevent checkpoints.
|
||||||
@@ -620,7 +669,7 @@ func (r *Replica) snapshot(ctx context.Context, generation string, index int) er
|
|||||||
r.putOperationTotalCounter.Inc()
|
r.putOperationTotalCounter.Inc()
|
||||||
r.putOperationBytesCounter.Add(float64(fi.Size()))
|
r.putOperationBytesCounter.Add(float64(fi.Size()))
|
||||||
|
|
||||||
log.Printf("%s(%s): snapshot: creating %s/%08x t=%s", r.db.Path(), r.Name(), generation, index, time.Since(startTime))
|
log.Printf("%s(%s): snapshot: creating %s/%08x t=%s", r.db.Path(), r.Name(), generation, index, time.Since(startTime).Truncate(time.Millisecond))
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user