Fix shadow wal corruption on stalled validation

This commit fixes a timing bug that occurs in a specific scenario
where the shadow wal sync stalls because of an s3 validation and
the catch up write to the shadow wal is large enough to allow a
window between WAL reads and the final copy.

The file copy has been replaced by direct writes of the frame
buffer to the shadow to ensure that every validated byte is exactly
what is being written to the shadow wal. The one downside to this
change is that the frame buffer will grow with the transaction
size so it will use additional heap. This can be replaced by a
spill-to-disk implementation but this should work well in the
short term.
This commit is contained in:
Ben Johnson
2021-02-06 07:21:48 -07:00
parent 2ff073c735
commit 7f81890bae
3 changed files with 86 additions and 36 deletions

69
db.go
View File

@@ -1025,66 +1025,65 @@ func (db *DB) copyToShadowWAL(filename string) (newSize int64, err error) {
return 0, fmt.Errorf("last checksum: %w", err) return 0, fmt.Errorf("last checksum: %w", err)
} }
// Seek to correct position on both files. // Seek to correct position on real wal.
if _, err := r.Seek(origSize, io.SeekStart); err != nil { if _, err := r.Seek(origSize, io.SeekStart); err != nil {
return 0, fmt.Errorf("wal seek: %w", err) return 0, fmt.Errorf("real wal seek: %w", err)
} else if _, err := w.Seek(origSize, io.SeekStart); err != nil {
return 0, fmt.Errorf("shadow wal seek: %w", err)
} }
// Read through WAL from last position to find the page of the last // Read through WAL from last position to find the page of the last
// committed transaction. // committed transaction.
tmpSz := origSize frame := make([]byte, db.pageSize+WALFrameHeaderSize)
var buf bytes.Buffer
offset := origSize
lastCommitSize := origSize lastCommitSize := origSize
buf := make([]byte, db.pageSize+WALFrameHeaderSize)
for { for {
Tracef("%s: copy-shadow: %s @ %d", db.path, filename, tmpSz)
// Read next page from WAL file. // Read next page from WAL file.
if _, err := io.ReadFull(r, buf); err == io.EOF || err == io.ErrUnexpectedEOF { if _, err := io.ReadFull(r, frame); err == io.EOF || err == io.ErrUnexpectedEOF {
Tracef("%s: copy-shadow: break %s", db.path, err) Tracef("%s: copy-shadow: break %s @ %d; err=%s", db.path, filename, offset, err)
break // end of file or partial page break // end of file or partial page
} else if err != nil { } else if err != nil {
return 0, fmt.Errorf("read wal: %w", err) return 0, fmt.Errorf("read wal: %w", err)
} }
// Read frame salt & compare to header salt. Stop reading on mismatch. // Read frame salt & compare to header salt. Stop reading on mismatch.
salt0 := binary.BigEndian.Uint32(buf[8:]) salt0 := binary.BigEndian.Uint32(frame[8:])
salt1 := binary.BigEndian.Uint32(buf[12:]) salt1 := binary.BigEndian.Uint32(frame[12:])
if salt0 != hsalt0 || salt1 != hsalt1 { if salt0 != hsalt0 || salt1 != hsalt1 {
Tracef("%s: copy-shadow: break: salt mismatch", db.path) Tracef("%s: copy-shadow: break: salt mismatch", db.path)
break break
} }
// Verify checksum of page is valid. // Verify checksum of page is valid.
fchksum0 := binary.BigEndian.Uint32(buf[16:]) fchksum0 := binary.BigEndian.Uint32(frame[16:])
fchksum1 := binary.BigEndian.Uint32(buf[20:]) fchksum1 := binary.BigEndian.Uint32(frame[20:])
chksum0, chksum1 = Checksum(bo, chksum0, chksum1, buf[:8]) // frame header chksum0, chksum1 = Checksum(bo, chksum0, chksum1, frame[:8]) // frame header
chksum0, chksum1 = Checksum(bo, chksum0, chksum1, buf[24:]) // frame data chksum0, chksum1 = Checksum(bo, chksum0, chksum1, frame[24:]) // frame data
if chksum0 != fchksum0 || chksum1 != fchksum1 { if chksum0 != fchksum0 || chksum1 != fchksum1 {
log.Printf("copy shadow: checksum mismatch, skipping: offset=%d (%x,%x) != (%x,%x)", tmpSz, chksum0, chksum1, fchksum0, fchksum1) log.Printf("copy shadow: checksum mismatch, skipping: offset=%d (%x,%x) != (%x,%x)", offset, chksum0, chksum1, fchksum0, fchksum1)
break break
} }
// Add page to the new size of the shadow WAL. // Add page to the new size of the shadow WAL.
tmpSz += int64(len(buf)) buf.Write(frame)
// Mark commit record. Tracef("%s: copy-shadow: ok %s offset=%d salt=%x %x", db.path, filename, offset, salt0, salt1)
newDBSize := binary.BigEndian.Uint32(buf[4:]) offset += int64(len(frame))
// Flush to shadow WAL if commit record.
newDBSize := binary.BigEndian.Uint32(frame[4:])
if newDBSize != 0 { if newDBSize != 0 {
lastCommitSize = tmpSz if _, err := buf.WriteTo(w); err != nil {
return 0, fmt.Errorf("write shadow wal: %w", err)
}
buf.Reset()
lastCommitSize = offset
} }
} }
// Seek to correct position on both files. // Sync & close.
if _, err := r.Seek(origSize, io.SeekStart); err != nil { if err := w.Sync(); err != nil {
return 0, fmt.Errorf("wal seek: %w", err)
} else if _, err := w.Seek(origSize, io.SeekStart); err != nil {
return 0, fmt.Errorf("shadow wal seek: %w", err)
}
// Copy bytes, sync & close.
if _, err := io.CopyN(w, r, lastCommitSize-origSize); err != nil {
return 0, err
} else if err := w.Sync(); err != nil {
return 0, err return 0, err
} else if err := w.Close(); err != nil { } else if err := w.Close(); err != nil {
return 0, err return 0, err
@@ -1107,6 +1106,8 @@ func (db *DB) ShadowWALReader(pos Pos) (r *ShadowWALReader, err error) {
return nil, err return nil, err
} else if r.N() > 0 { } else if r.N() > 0 {
return r, nil return r, nil
} else if err := r.Close(); err != nil { // no data, close, try next
return nil, err
} }
// Otherwise attempt to read the start of the next WAL file. // Otherwise attempt to read the start of the next WAL file.
@@ -1180,6 +1181,9 @@ type ShadowWALReader struct {
pos Pos pos Pos
} }
// Name returns the filename of the underlying file.
func (r *ShadowWALReader) Name() string { return r.f.Name() }
// Close closes the underlying WAL file handle. // Close closes the underlying WAL file handle.
func (r *ShadowWALReader) Close() error { return r.f.Close() } func (r *ShadowWALReader) Close() error { return r.f.Close() }
@@ -1297,6 +1301,11 @@ func (db *DB) checkpointAndInit(generation, mode string) error {
return err return err
} }
// Copy shadow WAL before checkpoint to copy as much as possible.
if _, err := db.copyToShadowWAL(shadowWALPath); err != nil {
return fmt.Errorf("cannot copy to end of shadow wal before checkpoint: %w", err)
}
// Execute checkpoint and immediately issue a write to the WAL to ensure // Execute checkpoint and immediately issue a write to the WAL to ensure
// a new page is written. // a new page is written.
if err := db.checkpoint(mode); err != nil { if err := db.checkpoint(mode); err != nil {

View File

@@ -95,9 +95,9 @@ type Pos struct {
// String returns a string representation. // String returns a string representation.
func (p Pos) String() string { func (p Pos) String() string {
if p.IsZero() { if p.IsZero() {
return "<>" return ""
} }
return fmt.Sprintf("<%s,%08x,%d>", p.Generation, p.Index, p.Offset) return fmt.Sprintf("%s/%08x:%d", p.Generation, p.Index, p.Offset)
} }
// IsZero returns true if p is the zero value. // IsZero returns true if p is the zero value.

View File

@@ -2,6 +2,7 @@ package litestream
import ( import (
"context" "context"
"encoding/binary"
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
@@ -598,6 +599,8 @@ func (r *FileReplica) Sync(ctx context.Context) (err error) {
} }
generation := dpos.Generation generation := dpos.Generation
Tracef("%s(%s): replica sync: db.pos=%s", r.db.Path(), r.Name(), dpos)
// Create snapshot if no snapshots exist for generation. // Create snapshot if no snapshots exist for generation.
if n, err := r.snapshotN(generation); err != nil { if n, err := r.snapshotN(generation); err != nil {
return err return err
@@ -617,6 +620,7 @@ func (r *FileReplica) Sync(ctx context.Context) (err error) {
return fmt.Errorf("cannot determine replica position: %s", err) return fmt.Errorf("cannot determine replica position: %s", err)
} }
Tracef("%s(%s): replica sync: calc new pos: %s", r.db.Path(), r.Name(), pos)
r.mu.Lock() r.mu.Lock()
r.pos = pos r.pos = pos
r.mu.Unlock() r.mu.Unlock()
@@ -669,11 +673,48 @@ func (r *FileReplica) syncWAL(ctx context.Context) (err error) {
return err return err
} }
n, err := io.Copy(w, rd) // Copy header if at offset zero.
r.walBytesCounter.Add(float64(n)) var psalt uint64 // previous salt value
if pos := rd.Pos(); pos.Offset == 0 {
buf := make([]byte, WALHeaderSize)
if _, err := io.ReadFull(rd, buf); err != nil {
return err
}
psalt = binary.BigEndian.Uint64(buf[16:24])
n, err := w.Write(buf)
if err != nil { if err != nil {
return err return err
} }
r.walBytesCounter.Add(float64(n))
}
// Copy frames.
for {
pos := rd.Pos()
assert(pos.Offset == frameAlign(pos.Offset, r.db.pageSize), "shadow wal reader not frame aligned")
buf := make([]byte, WALFrameHeaderSize+r.db.pageSize)
if _, err := io.ReadFull(rd, buf); err == io.EOF {
break
} else if err != nil {
return err
}
// Verify salt matches the previous frame/header read.
salt := binary.BigEndian.Uint64(buf[8:16])
if psalt != 0 && psalt != salt {
return fmt.Errorf("replica salt mismatch: %s", filepath.Base(filename))
}
psalt = salt
n, err := w.Write(buf)
if err != nil {
return err
}
r.walBytesCounter.Add(float64(n))
}
if err := w.Sync(); err != nil { if err := w.Sync(); err != nil {
return err return err