From 5afd0bf161e590e0bc9369094711db35fc846b52 Mon Sep 17 00:00:00 2001 From: Lincoln Stoll Date: Mon, 31 Oct 2022 09:14:02 +0100 Subject: [PATCH] Handle errors when deleting objects from S3 I recently noticed that the cost for ListBucket calls was increasing for an application that was using Litestream. After investigating it seemed that the bucket had retained the entire history of data, while Litestream was continually logging that it was deleting the same data: ``` 2022-10-30T12:00:27Z (s3): wal segmented deleted before 0792d3393bf79ced/00000233: n=1428 2022-10-30T13:00:24Z (s3): wal segmented deleted before 0792d3393bf79ced/00000233: n=1428 ``` This is occuring because the DeleteObjects call is a batch item, that returns the individual object deletion errors in the response[1]. The S3 replica client discards the response, and only handles errors in the original API call. I had a misconfigured IAM policy that meant all deletes were failing, but this never actually bubbled up as a real error. To fix this, I added a check for the response body to handle any errors the operation might have encountered. Because this may include a large number of errors (in this case 1428 of them), the output is summarized to avoid an overly large error message. When items are not found, they will not return an error[2] - they will still be marked as deleted, so this change should be in-line with the original intentions of this code. 1: https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html#API_DeleteObjects_Example_2 2: https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html --- s3/replica_client.go | 45 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/s3/replica_client.go b/s3/replica_client.go index b68628a..7632a50 100644 --- a/s3/replica_client.go +++ b/s3/replica_client.go @@ -208,10 +208,14 @@ func (c *ReplicaClient) DeleteGeneration(ctx context.Context, generation string) n = len(objIDs) } - if _, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ + out, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ Bucket: aws.String(c.Bucket), Delete: &s3.Delete{Objects: objIDs[:n], Quiet: aws.Bool(true)}, - }); err != nil { + }) + if err != nil { + return err + } + if err := deleteOutputError(out); err != nil { return err } internal.OperationTotalCounterVec.WithLabelValues(ReplicaClientType, "DELETE").Inc() @@ -303,10 +307,14 @@ func (c *ReplicaClient) DeleteSnapshot(ctx context.Context, generation string, i return fmt.Errorf("cannot determine snapshot path: %w", err) } - if _, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ + out, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ Bucket: aws.String(c.Bucket), Delete: &s3.Delete{Objects: []*s3.ObjectIdentifier{{Key: &key}}, Quiet: aws.Bool(true)}, - }); err != nil { + }) + if err != nil { + return err + } + if err := deleteOutputError(out); err != nil { return err } @@ -405,13 +413,16 @@ func (c *ReplicaClient) DeleteWALSegments(ctx context.Context, a []litestream.Po } // Delete S3 objects in bulk. - if _, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ + out, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ Bucket: aws.String(c.Bucket), Delete: &s3.Delete{Objects: objIDs[:n], Quiet: aws.Bool(true)}, - }); err != nil { + }) + if err != nil { + return err + } + if err := deleteOutputError(out); err != nil { return err } - internal.OperationTotalCounterVec.WithLabelValues(ReplicaClientType, "DELETE").Inc() a = a[n:] @@ -454,10 +465,14 @@ func (c *ReplicaClient) DeleteAll(ctx context.Context) error { n = len(objIDs) } - if _, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ + out, err := c.s3.DeleteObjectsWithContext(ctx, &s3.DeleteObjectsInput{ Bucket: aws.String(c.Bucket), Delete: &s3.Delete{Objects: objIDs[:n], Quiet: aws.Bool(true)}, - }); err != nil { + }) + if err != nil { + return err + } + if err := deleteOutputError(out); err != nil { return err } internal.OperationTotalCounterVec.WithLabelValues(ReplicaClientType, "DELETE").Inc() @@ -737,3 +752,15 @@ func isNotExists(err error) bool { return false } } + +func deleteOutputError(out *s3.DeleteObjectsOutput) error { + switch len(out.Errors) { + case 0: + return nil + case 1: + return fmt.Errorf("deleting object %s: %s - %s", *out.Errors[0].Key, *out.Errors[0].Code, *out.Errors[0].Message) + default: + return fmt.Errorf("%d errors occured deleting objects, %s: %s - (%s (and %d others)", + len(out.Errors), *out.Errors[0].Key, *out.Errors[0].Code, *out.Errors[0].Message, len(out.Errors)-1) + } +}