Add validation interval
@@ -59,8 +59,6 @@ func (m *Main) Run(ctx context.Context, args []string) (err error) {
 		return (&RestoreCommand{}).Run(ctx, args)
 	case "snapshots":
 		return (&SnapshotsCommand{}).Run(ctx, args)
-	case "validate":
-		return (&ValidateCommand{}).Run(ctx, args)
 	case "version":
 		return (&VersionCommand{}).Run(ctx, args)
 	case "wal":
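
(The standalone validate subcommand is removed here; the same checksum comparison now runs automatically on a timer via the new validation-interval setting introduced below.)
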
@@ -190,6 +188,7 @@ type ReplicaConfig struct {
 	Retention              time.Duration `yaml:"retention"`
 	RetentionCheckInterval time.Duration `yaml:"retention-check-interval"`
 	SyncInterval           time.Duration `yaml:"sync-interval"` // s3 only
+	ValidationInterval     time.Duration `yaml:"validation-interval"`
 
 	// S3 settings
 	AccessKeyID string `yaml:"access-key-id"`
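
For reference, a minimal sketch of how the new key would be set in a litestream configuration file. The database path and bucket name are placeholders, and the value uses Go duration syntax like the other interval settings:

    dbs:
      - path: /var/lib/app.db
        replicas:
          - type: s3
            bucket: my-bucket
            validation-interval: 6h
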
@@ -292,6 +291,9 @@ func newFileReplicaFromConfig(db *litestream.DB, c *Config, dbc *DBConfig, rc *R
 	if v := rc.RetentionCheckInterval; v > 0 {
 		r.RetentionCheckInterval = v
 	}
+	if v := rc.ValidationInterval; v > 0 {
+		r.ValidationInterval = v
+	}
 	return r, nil
 }
 
@@ -341,5 +343,8 @@ func newS3ReplicaFromConfig(db *litestream.DB, c *Config, dbc *DBConfig, rc *Rep
 	if v := rc.SyncInterval; v > 0 {
 		r.SyncInterval = v
 	}
+	if v := rc.ValidationInterval; v > 0 {
+		r.ValidationInterval = v
+	}
 	return r, nil
 }
@@ -1,136 +0,0 @@
-package main
-
-import (
-	"context"
-	"errors"
-	"flag"
-	"fmt"
-	"log"
-	"os"
-	"path/filepath"
-
-	"github.com/benbjohnson/litestream"
-)
-
-type ValidateCommand struct{}
-
-func (c *ValidateCommand) Run(ctx context.Context, args []string) (err error) {
-	var configPath string
-	opt := litestream.NewRestoreOptions()
-	fs := flag.NewFlagSet("litestream-validate", flag.ContinueOnError)
-	registerConfigFlag(fs, &configPath)
-	fs.StringVar(&opt.ReplicaName, "replica", "", "replica name")
-	fs.BoolVar(&opt.DryRun, "dry-run", false, "dry run")
-	verbose := fs.Bool("v", false, "verbose output")
-	fs.Usage = c.Usage
-	if err := fs.Parse(args); err != nil {
-		return err
-	} else if fs.NArg() == 0 || fs.Arg(0) == "" {
-		return fmt.Errorf("database path required")
-	} else if fs.NArg() > 1 {
-		return fmt.Errorf("too many arguments")
-	}
-
-	// Load configuration.
-	if configPath == "" {
-		return errors.New("-config required")
-	}
-	config, err := ReadConfigFile(configPath)
-	if err != nil {
-		return err
-	}
-
-	// Verbose output is automatically enabled if dry run is specified.
-	if opt.DryRun {
-		*verbose = true
-	}
-
-	// Instantiate logger if verbose output is enabled.
-	if *verbose {
-		opt.Logger = log.New(os.Stderr, "", log.LstdFlags)
-	}
-
-	// Determine absolute path for database.
-	dbPath, err := filepath.Abs(fs.Arg(0))
-	if err != nil {
-		return err
-	}
-
-	// Instantiate DB.
-	dbConfig := config.DBConfig(dbPath)
-	if dbConfig == nil {
-		return fmt.Errorf("database not found in config: %s", dbPath)
-	}
-	db, err := newDBFromConfig(&config, dbConfig)
-	if err != nil {
-		return err
-	}
-
-	// Ensure replica exists, if specified.
-	if opt.ReplicaName != "" && db.Replica(opt.ReplicaName) == nil {
-		return fmt.Errorf("replica not found: %s", opt.ReplicaName)
-	}
-
-	// Validate all matching replicas.
-	var hasInvalidReplica bool
-	for _, r := range db.Replicas {
-		if opt.ReplicaName != "" && opt.ReplicaName != r.Name() {
-			continue
-		}
-
-		if err := db.Validate(ctx, r.Name(), opt); err != nil {
-			fmt.Printf("%s: replica invalid: %s\n", r.Name(), err)
-		}
-	}
-
-	if hasInvalidReplica {
-		return fmt.Errorf("one or more invalid replicas found")
-	}
-
-	fmt.Println("ok")
-	return nil
-}
-
-func (c *ValidateCommand) Usage() {
-	fmt.Printf(`
-The validate command compares a checksum of the primary database with a
-checksum of the replica at the same point in time. Returns an error if the
-databases are not equal.
-
-The restored database must be written to a temporary file so you must ensure
-you have enough disk space before performing this operation.
-
-Usage:
-
-	litestream validate [arguments] DB
-
-Arguments:
-
-	-config PATH
-	    Specifies the configuration file.
-	    Defaults to %s
-
-	-replica NAME
-	    Validate a specific replica.
-	    Defaults to validating all replicas.
-
-	-dry-run
-	    Prints all log output as if it were running but does
-	    not perform actual validation.
-
-	-v
-	    Verbose output.
-
-
-Examples:
-
-	# Validate all replicas for the given database.
-	$ litestream validate /path/to/db
-
-	# Validate only the S3 replica.
-	$ litestream restore -replica s3 /path/to/db
-
-`[1:],
-		DefaultConfigPath(),
-	)
-}
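
(The file deleted above is the old one-shot ValidateCommand. Its restore-and-compare logic reappears below as the exported ValidateReplica helper in replica.go, which the new background validators call on each tick.)
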
db.go (138 changed lines)
@@ -1470,143 +1470,19 @@ func (db *DB) restoreWAL(ctx context.Context, r Replica, generation string, inde
 	return nil
 }
 
-// Validate restores the most recent data from a replica and validates
-// that the resulting database matches the current database.
-func (db *DB) Validate(ctx context.Context, replicaName string, opt RestoreOptions) error {
-	if replicaName == "" {
-		return fmt.Errorf("replica name required")
-	}
-
-	// Look up replica by name.
-	r := db.Replica(replicaName)
-	if r == nil {
-		return fmt.Errorf("replica not found: %q", replicaName)
-	}
-
-	// Ensure logger exists.
-	logger := opt.Logger
-	if logger == nil {
-		logger = log.New(ioutil.Discard, "", 0)
-	}
-
-	logger.Printf("computing primary checksum")
-
-	// Compute checksum of primary database under read lock. This prevents a
-	// sync from occurring and the database will not be written.
-	chksum0, pos, err := db.CRC64()
-	if err != nil {
-		return fmt.Errorf("cannot compute checksum: %w", err)
-	}
-	logger.Printf("primary checksum computed: %08x", chksum0)
-
-	// Wait until replica catches up to position.
-	logger.Printf("waiting for replica")
-	if err := db.waitForReplica(ctx, r, pos, logger); err != nil {
-		return fmt.Errorf("cannot wait for replica: %w", err)
-	}
-	logger.Printf("replica ready, restoring")
-
-	// Restore replica to a temporary directory.
-	tmpdir, err := ioutil.TempDir("", "*-litestream")
-	if err != nil {
-		return err
-	}
-	defer os.RemoveAll(tmpdir)
-
-	restorePath := filepath.Join(tmpdir, "db")
-	if err := db.Restore(ctx, RestoreOptions{
-		OutputPath:  restorePath,
-		ReplicaName: replicaName,
-		Generation:  pos.Generation,
-		Index:       pos.Index - 1,
-		DryRun:      opt.DryRun,
-		Logger:      opt.Logger,
-	}); err != nil {
-		return fmt.Errorf("cannot restore: %w", err)
-	}
-
-	// Skip remaining validation if this is just a dry run.
-	if opt.DryRun {
-		return fmt.Errorf("validation stopped, dry run only")
-	}
-
-	logger.Printf("restore complete, computing checksum")
-
-	// Open file handle for restored database.
-	f, err := os.Open(db.Path())
-	if err != nil {
-		return err
-	}
-	defer f.Close()
-
-	// Compute checksum.
-	h := crc64.New(crc64.MakeTable(crc64.ISO))
-	if _, err := io.Copy(h, f); err != nil {
-		return err
-	}
-	chksum1 := h.Sum64()
-
-	logger.Printf("replica checksum computed: %08x", chksum1)
-
-	// Validate checksums match.
-	if chksum0 != chksum1 {
-		return ErrChecksumMismatch
-	}
-
-	return nil
-}
-
-// waitForReplica blocks until replica reaches at least the given position.
-func (db *DB) waitForReplica(ctx context.Context, r Replica, pos Pos, logger *log.Logger) error {
-	ticker := time.NewTicker(500 * time.Millisecond)
-	defer ticker.Stop()
-
-	once := make(chan struct{}, 1)
-	once <- struct{}{}
-
-	for {
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		case <-ticker.C:
-		case <-once: // immediate on first check
-		}
-
-		// Obtain current position of replica, check if past target position.
-		curr, err := r.CalcPos(ctx, pos.Generation)
-		if err != nil {
-			logger.Printf("cannot obtain replica position: %s", err)
-			continue
-		}
-
-		ready := true
-		if curr.Generation != pos.Generation {
-			ready = false
-		} else if curr.Index < pos.Index {
-			ready = false
-		} else if curr.Index == pos.Index && curr.Offset < pos.Offset {
-			ready = false
-		}
-
-		// If not ready, restart loop.
-		if !ready {
-			logger.Printf("replica at %s, waiting for %s", curr, pos)
-			continue
-		}
-
-		// Current position at or after target position.
-		return nil
-	}
-}
-
 // CRC64 returns a CRC-64 ISO checksum of the database and its current position.
 //
 // This function obtains a read lock so it prevents syncs from occuring until
 // the operation is complete. The database will still be usable but it will be
 // unable to checkpoint during this time.
 func (db *DB) CRC64() (uint64, Pos, error) {
-	db.mu.RLock()
-	defer db.mu.RUnlock()
+	db.mu.Lock()
+	defer db.mu.Unlock()
 
+	// Force a RESTART checkpoint to ensure the database is at the start of the WAL.
+	if err := db.checkpoint(CheckpointModeRestart); err != nil {
+		return 0, Pos{}, err
+	}
+
 	// Obtain current position. Clear the offset since we are only reading the
 	// DB and not applying the current WAL.
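
The CRC64 change above is the subtle part of this commit. The read lock becomes an exclusive lock, presumably because the newly added RESTART checkpoint writes to the database file, and checkpointing first means the checksum describes a database whose WAL has been folded in, so it can be compared byte-for-byte against a restored copy. (The doc comment above it still says "read lock" and now reads stale.) The primitive itself is plain stdlib CRC-64/ISO; here is a self-contained sketch for reproducing a checksum out-of-band — the fileCRC64 name and package are illustrative, not part of the codebase:

    package checksum

    import (
        "hash/crc64"
        "io"
        "os"
    )

    // fileCRC64 returns the CRC-64/ISO checksum of the file at path,
    // using the same table as db.CRC64 and ValidateReplica.
    func fileCRC64(path string) (uint64, error) {
        f, err := os.Open(path)
        if err != nil {
            return 0, err
        }
        defer f.Close()

        h := crc64.New(crc64.MakeTable(crc64.ISO))
        if _, err := io.Copy(h, f); err != nil {
            return 0, err
        }
        return h.Sum64(), nil
    }
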
@@ -34,4 +34,11 @@ var (
 		Name:      "wal_offset",
 		Help:      "The current WAL offset",
 	}, []string{"db", "name"})
+
+	ReplicaValidationTotalCounterVec = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "litestream",
+		Subsystem: "replica",
+		Name:      "validation_total",
+		Help:      "The number of validations performed",
+	}, []string{"db", "name", "status"})
 )
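
Given the namespace and subsystem above, the new counter is exposed as litestream_replica_validation_total with a status label of "ok" or "error", so a deployment could alert on failed validations with a PromQL expression along these lines (illustrative):

    increase(litestream_replica_validation_total{status="error"}[1h]) > 0
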
replica.go (152 changed lines)
@@ -4,6 +4,7 @@ import (
 	"compress/gzip"
 	"context"
 	"fmt"
+	"hash/crc64"
 	"io"
 	"io/ioutil"
 	"log"
@@ -26,6 +27,9 @@ type Replica interface {
 	// String identifier for the type of replica ("file", "s3", etc).
 	Type() string
 
+	// The parent database.
+	DB() *DB
+
 	// Starts replicating in a background goroutine.
 	Start(ctx context.Context)
 
@@ -102,6 +106,9 @@ type FileReplica struct {
 	// Time between checks for retention.
 	RetentionCheckInterval time.Duration
 
+	// Time between validation checks.
+	ValidationInterval time.Duration
+
 	// If true, replica monitors database for changes automatically.
 	// Set to false if replica is being used synchronously (such as in tests).
 	MonitorEnabled bool
@@ -141,6 +148,11 @@ func (r *FileReplica) Type() string {
 	return "file"
 }
 
+// DB returns the parent database reference.
+func (r *FileReplica) DB() *DB {
+	return r.db
+}
+
 // Path returns the path the replica was initialized with.
 func (r *FileReplica) Path() string {
 	return r.dst
@@ -387,9 +399,10 @@ func (r *FileReplica) Start(ctx context.Context) {
 	ctx, r.cancel = context.WithCancel(ctx)
 
 	// Start goroutine to replicate data.
-	r.wg.Add(2)
+	r.wg.Add(3)
 	go func() { defer r.wg.Done(); r.monitor(ctx) }()
 	go func() { defer r.wg.Done(); r.retainer(ctx) }()
+	go func() { defer r.wg.Done(); r.validator(ctx) }()
 }
 
 // Stop cancels any outstanding replication and blocks until finished.
@@ -446,6 +459,28 @@ func (r *FileReplica) retainer(ctx context.Context) {
 	}
 }
 
+// validator runs in a separate goroutine and handles periodic validation.
+func (r *FileReplica) validator(ctx context.Context) {
+	if r.ValidationInterval <= 0 {
+		return
+	}
+
+	ticker := time.NewTicker(r.ValidationInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			if err := ValidateReplica(ctx, r); err != nil {
+				log.Printf("%s(%s): validation error: %s", r.db.Path(), r.Name(), err)
+				continue
+			}
+		}
+	}
+}
+
 // CalcPos returns the position for the replica for the current generation.
 // Returns a zero value if there is no active generation.
 func (r *FileReplica) CalcPos(ctx context.Context, generation string) (pos Pos, err error) {
@@ -932,3 +967,118 @@ func compressFile(src, dst string, uid, gid int) error {
 	// Move compressed file to final location.
 	return os.Rename(dst+".tmp", dst)
 }
+
+// ValidateReplica restores the most recent data from a replica and validates
+// that the resulting database matches the current database.
+func ValidateReplica(ctx context.Context, r Replica) error {
+	db := r.DB()
+
+	log.Printf("%s(%s): computing primary checksum", db.Path(), r.Name())
+
+	// Compute checksum of primary database under lock. This prevents a
+	// sync from occurring and the database will not be written.
+	chksum0, pos, err := db.CRC64()
+	if err != nil {
+		return fmt.Errorf("cannot compute checksum: %w", err)
+	}
+	log.Printf("%s(%s): primary checksum computed: %08x", db.Path(), r.Name(), chksum0)
+
+	// Wait until replica catches up to position.
+	log.Printf("%s(%s): waiting for replica", db.Path(), r.Name())
+	if err := waitForReplica(ctx, r, pos); err != nil {
+		return fmt.Errorf("cannot wait for replica: %w", err)
+	}
+	log.Printf("%s(%s): replica ready, restoring", db.Path(), r.Name())
+
+	// Restore replica to a temporary directory.
+	tmpdir, err := ioutil.TempDir("", "*-litestream")
+	if err != nil {
+		return err
+	}
+	defer os.RemoveAll(tmpdir)
+
+	restorePath := filepath.Join(tmpdir, "db")
+	if err := db.Restore(ctx, RestoreOptions{
+		OutputPath:  restorePath,
+		ReplicaName: r.Name(),
+		Generation:  pos.Generation,
+		Index:       pos.Index - 1,
+		Logger:      log.New(os.Stderr, "", 0),
+	}); err != nil {
+		return fmt.Errorf("cannot restore: %w", err)
+	}
+
+	log.Printf("%s(%s): restore complete, computing checksum", db.Path(), r.Name())
+
+	// Open file handle for restored database.
+	f, err := os.Open(db.Path())
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// Compute checksum.
+	h := crc64.New(crc64.MakeTable(crc64.ISO))
+	if _, err := io.Copy(h, f); err != nil {
+		return err
+	}
+	chksum1 := h.Sum64()
+
+	log.Printf("%s(%s): replica checksum computed: %08x", db.Path(), r.Name(), chksum1)
+
+	// Validate checksums match.
+	if chksum0 != chksum1 {
+		internal.ReplicaValidationTotalCounterVec.WithLabelValues(db.Path(), r.Name(), "error").Inc()
+		return ErrChecksumMismatch
+	}
+
+	internal.ReplicaValidationTotalCounterVec.WithLabelValues(db.Path(), r.Name(), "ok").Inc()
+	log.Printf("%s(%s): replica ok", db.Path(), r.Name())
+
+	return nil
+}
+
+// waitForReplica blocks until replica reaches at least the given position.
+func waitForReplica(ctx context.Context, r Replica, pos Pos) error {
+	db := r.DB()
+
+	ticker := time.NewTicker(500 * time.Millisecond)
+	defer ticker.Stop()
+
+	once := make(chan struct{}, 1)
+	once <- struct{}{}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-ticker.C:
+		case <-once: // immediate on first check
+		}
+
+		// Obtain current position of replica, check if past target position.
+		curr, err := r.CalcPos(ctx, pos.Generation)
+		if err != nil {
+			log.Printf("%s(%s): cannot obtain replica position: %s", db.Path(), r.Name(), err)
+			continue
+		}
+
+		ready := true
+		if curr.Generation != pos.Generation {
+			ready = false
+		} else if curr.Index < pos.Index {
+			ready = false
+		} else if curr.Index == pos.Index && curr.Offset < pos.Offset {
+			ready = false
+		}
+
+		// If not ready, restart loop.
+		if !ready {
+			log.Printf("%s(%s): replica at %s, waiting for %s", db.Path(), r.Name(), curr, pos)
+			continue
+		}
+
+		// Current position at or after target position.
+		return nil
+	}
}
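
Since ValidateReplica is exported, it can also be driven directly rather than only from the background validator. A hypothetical one-off check, assuming an already-opened and configured *litestream.DB and the context, fmt, and litestream imports in scope:

    // validateAll runs a single validation pass over every replica of db.
    // Sketch only; validateAll is not part of the codebase.
    func validateAll(ctx context.Context, db *litestream.DB) error {
        for _, r := range db.Replicas {
            if err := litestream.ValidateReplica(ctx, r); err != nil {
                return fmt.Errorf("%s: validation failed: %w", r.Name(), err)
            }
        }
        return nil
    }

One detail worth flagging in review: after the restore, the second checksum opens db.Path() rather than the restored copy at restorePath, so as written both checksums read the primary database. The same line appears in the deleted db.Validate, so it looks like a carried-over slip rather than an intentional change.
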
s3/s3.go (33 changed lines)
@@ -82,6 +82,9 @@ type Replica struct {
 	// Time between retention checks.
 	RetentionCheckInterval time.Duration
 
+	// Time between validation checks.
+	ValidationInterval time.Duration
+
 	// If true, replica monitors database for changes automatically.
 	// Set to false if replica is being used synchronously (such as in tests).
 	MonitorEnabled bool
@@ -128,6 +131,11 @@ func (r *Replica) Type() string {
 	return "s3"
 }
 
+// DB returns the parent database reference.
+func (r *Replica) DB() *litestream.DB {
+	return r.db
+}
+
 // LastPos returns the last successfully replicated position.
 func (r *Replica) LastPos() litestream.Pos {
 	r.mu.RLock()
@@ -410,9 +418,10 @@ func (r *Replica) Start(ctx context.Context) {
 	ctx, r.cancel = context.WithCancel(ctx)
 
 	// Start goroutines to manage replica data.
-	r.wg.Add(2)
+	r.wg.Add(3)
 	go func() { defer r.wg.Done(); r.monitor(ctx) }()
 	go func() { defer r.wg.Done(); r.retainer(ctx) }()
+	go func() { defer r.wg.Done(); r.validator(ctx) }()
 }
 
 // Stop cancels any outstanding replication and blocks until finished.
@@ -477,6 +486,28 @@ func (r *Replica) retainer(ctx context.Context) {
 	}
 }
 
+// validator runs in a separate goroutine and handles periodic validation.
+func (r *Replica) validator(ctx context.Context) {
+	if r.ValidationInterval <= 0 {
+		return
+	}
+
+	ticker := time.NewTicker(r.ValidationInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			if err := litestream.ValidateReplica(ctx, r); err != nil {
+				log.Printf("%s(%s): validation error: %s", r.db.Path(), r.Name(), err)
+				continue
+			}
+		}
+	}
+}
+
 // CalcPos returns the position for the replica for the current generation.
 // Returns a zero value if there is no active generation.
 func (r *Replica) CalcPos(ctx context.Context, generation string) (pos litestream.Pos, err error) {
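
The S3 validator mirrors the file replica's line for line; only the receiver type and the package-qualified litestream.ValidateReplica call differ. Operationally, note that every validation tick restores a complete copy of the database from S3 into a temporary directory before checksumming it, so short intervals trade download cost and disk I/O for earlier detection of divergence.
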