From 92f6c212421219456c21190ce2e35357220a2016 Mon Sep 17 00:00:00 2001 From: Justin Cohen Date: Fri, 19 Nov 2021 14:12:08 -0500 Subject: [PATCH] Retry UploadReport on http upload failures. This implements a per-report retry rate limit (as opposed to per upload rate limit in ShouldRateLimitUpload). When a report upload ends in a retry state, an in-memory only timestamp is stored with the next possible retry time. This timestamp is a backoff from the main thread work interval, doubling on each attemt. Because this is only stored in memory, on restart reports in the retry state will always be tried once, and then fall back into the next backoff. This continues until 5 retry attempts are reached. Change-Id: Ibde8855a8a9f0743f0b0bd4d5e3de8a45c64bcb6 Reviewed-on: https://chromium-review.googlesource.com/c/crashpad/crashpad/+/3087723 Reviewed-by: Mark Mentovai Reviewed-by: Robert Sesek Commit-Queue: Justin Cohen --- handler/crash_report_upload_thread.cc | 128 ++++++++++++++++++-------- handler/crash_report_upload_thread.h | 36 ++++++++ util/misc/metrics.h | 4 + util/misc/uuid.cc | 4 + util/misc/uuid.h | 1 + util/misc/uuid_test.cc | 16 ++++ 6 files changed, 149 insertions(+), 40 deletions(-) diff --git a/handler/crash_report_upload_thread.cc b/handler/crash_report_upload_thread.cc index 60f172b6..3872e114 100644 --- a/handler/crash_report_upload_thread.cc +++ b/handler/crash_report_upload_thread.cc @@ -46,6 +46,21 @@ namespace crashpad { +namespace { + +// The number of seconds to wait between checking for pending reports. +const int kRetryWorkIntervalSeconds = 15 * 60; + +#if defined(OS_IOS) +// The number of times to attempt to upload a pending report, repeated on +// failure. Attempts will happen once per launch, once per call to +// ReportPending(), and, if Options.watch_pending_reports is true, once every +// kRetryWorkIntervalSeconds. Currently iOS only. +const int kRetryAttempts = 5; +#endif + +} // namespace + CrashReportUploadThread::CrashReportUploadThread(CrashReportDatabase* database, const std::string& url, const Options& options) @@ -55,7 +70,7 @@ CrashReportUploadThread::CrashReportUploadThread(CrashReportDatabase* database, // absence of a signal from the handler thread. This allows for failed // uploads to be retried periodically, and for pending reports written by // other processes to be recognized. - thread_(options.watch_pending_reports ? 15 * 60.0 + thread_(options.watch_pending_reports ? kRetryWorkIntervalSeconds : WorkerThread::kIndefiniteWait, this), known_pending_report_uuids_(), @@ -154,45 +169,13 @@ void CrashReportUploadThread::ProcessPendingReport( return; } - // This currently implements very simplistic rate-limiting, compatible with - // the Breakpad client, where the strategy is to permit one upload attempt per - // hour, and retire reports that would exceed this limit or for which the - // upload fails on the first attempt. - // - // If upload was requested explicitly (i.e. by user action), we do not - // throttle the upload. - // - // TODO(mark): Provide a proper rate-limiting strategy and allow for failed - // upload attempts to be retried. - if (!report.upload_explicitly_requested && options_.rate_limit) { - time_t last_upload_attempt_time; - if (settings->GetLastUploadAttemptTime(&last_upload_attempt_time)) { - time_t now = time(nullptr); - if (now >= last_upload_attempt_time) { - // If the most recent upload attempt occurred within the past hour, - // don’t attempt to upload the new report. If it happened longer ago, - // attempt to upload the report. - constexpr int kUploadAttemptIntervalSeconds = 60 * 60; // 1 hour - if (now - last_upload_attempt_time < kUploadAttemptIntervalSeconds) { - database_->SkipReportUpload( - report.uuid, Metrics::CrashSkippedReason::kUploadThrottled); - return; - } - } else { - // The most recent upload attempt purportedly occurred in the future. If - // it “happened” at least one day in the future, assume that the last - // upload attempt time is bogus, and attempt to upload the report. If - // the most recent upload time is in the future but within one day, - // accept it and don’t attempt to upload the report. - constexpr int kBackwardsClockTolerance = 60 * 60 * 24; // 1 day - if (last_upload_attempt_time - now < kBackwardsClockTolerance) { - database_->SkipReportUpload( - report.uuid, Metrics::CrashSkippedReason::kUnexpectedTime); - return; - } - } - } - } + if (ShouldRateLimitUpload(report)) + return; + +#if defined(OS_IOS) + if (ShouldRateLimitRetry(report)) + return; +#endif std::unique_ptr upload_report; CrashReportDatabase::OperationStatus status = @@ -234,6 +217,19 @@ void CrashReportUploadThread::ProcessPendingReport( report.uuid, Metrics::CrashSkippedReason::kPrepareForUploadFailed); break; case UploadResult::kRetry: +#if defined(OS_IOS) + if (upload_report->upload_attempts > kRetryAttempts) { + upload_report.reset(); + database_->SkipReportUpload(report.uuid, + Metrics::CrashSkippedReason::kUploadFailed); + } else { + Metrics::CrashUploadSkipped( + Metrics::CrashSkippedReason::kUploadFailedButCanRetry); + retry_uuid_time_map_[report.uuid] = + time(nullptr) + + (1 << upload_report->upload_attempts) * kRetryWorkIntervalSeconds; + } +#else upload_report.reset(); // TODO(mark): Deal with retries properly: don’t call SkipReportUplaod() @@ -241,6 +237,7 @@ void CrashReportUploadThread::ProcessPendingReport( // too many times. database_->SkipReportUpload(report.uuid, Metrics::CrashSkippedReason::kUploadFailed); +#endif break; } } @@ -344,4 +341,55 @@ void CrashReportUploadThread::DoWork(const WorkerThread* thread) { ProcessPendingReports(); } +bool CrashReportUploadThread::ShouldRateLimitUpload( + const CrashReportDatabase::Report& report) { + if (report.upload_explicitly_requested || !options_.rate_limit) + return false; + + Settings* const settings = database_->GetSettings(); + time_t last_upload_attempt_time; + if (settings->GetLastUploadAttemptTime(&last_upload_attempt_time)) { + time_t now = time(nullptr); + if (now >= last_upload_attempt_time) { + // If the most recent upload attempt occurred within the past hour, + // don’t attempt to upload the new report. If it happened longer ago, + // attempt to upload the report. + constexpr int kUploadAttemptIntervalSeconds = 60 * 60; // 1 hour + if (now - last_upload_attempt_time < kUploadAttemptIntervalSeconds) { + database_->SkipReportUpload( + report.uuid, Metrics::CrashSkippedReason::kUploadThrottled); + return true; + } + } else { + // The most recent upload attempt purportedly occurred in the future. If + // it “happened” at least one day in the future, assume that the last + // upload attempt time is bogus, and attempt to upload the report. If + // the most recent upload time is in the future but within one day, + // accept it and don’t attempt to upload the report. + constexpr int kBackwardsClockTolerance = 60 * 60 * 24; // 1 day + if (last_upload_attempt_time - now < kBackwardsClockTolerance) { + database_->SkipReportUpload( + report.uuid, Metrics::CrashSkippedReason::kUnexpectedTime); + return true; + } + } + } + return false; +} + +#if defined(OS_IOS) +bool CrashReportUploadThread::ShouldRateLimitRetry( + const CrashReportDatabase::Report& report) { + if (retry_uuid_time_map_.find(report.uuid) != retry_uuid_time_map_.end()) { + time_t now = time(nullptr); + if (now < retry_uuid_time_map_[report.uuid]) { + return true; + } else { + retry_uuid_time_map_.erase(report.uuid); + } + } + return false; +} +#endif + } // namespace crashpad diff --git a/handler/crash_report_upload_thread.h b/handler/crash_report_upload_thread.h index 1a88a087..7649ae0d 100644 --- a/handler/crash_report_upload_thread.h +++ b/handler/crash_report_upload_thread.h @@ -17,6 +17,7 @@ #include #include +#include #include "client/crash_report_database.h" #include "util/misc/uuid.h" @@ -170,10 +171,45 @@ class CrashReportUploadThread : public WorkerThread::Delegate, //! been called on any thread, as well as periodically on a timer. void DoWork(const WorkerThread* thread) override; + //! \brief Rate-limit uploads. + //! + //! \param[in] report The crash report to process. + //! + //! This currently implements very simplistic rate-limiting, compatible with + //! the Breakpad client, where the strategy is to permit one upload attempt + //! per hour, and retire reports that would exceed this limit or for which the + //! upload fails on the first attempt. + //! If upload was requested explicitly (i.e. by user action), do not throttle + //! the upload. + //! + //! TODO(mark): Provide a proper rate-limiting strategy and allow for failed + //! upload attempts to be retried. + bool ShouldRateLimitUpload(const CrashReportDatabase::Report& report); + +#if defined(OS_IOS) + //! \brief Rate-limit report retries. + //! + //! \param[in] report The crash report to process. + //! + //! This implements a per-report retry rate limit (as opposed to per upload + //! rate limit in ShouldRateLimitUpload). When a report upload ends in a retry + //! state, an in-memory only timestamp is stored in |retry_uuid_time_map_| + //! with the next possible retry time. This timestamp is a backoff from the + //! main thread work interval, doubling on each attemt. Because this is only + //! stored in memory, on restart reports in the retry state will always be + //! tried once, and then fall back into the next backoff. This continues until + //! kRetryAttempts is reached. + bool ShouldRateLimitRetry(const CrashReportDatabase::Report& report); +#endif + const Options options_; const std::string url_; WorkerThread thread_; ThreadSafeVector known_pending_report_uuids_; +#if defined(OS_IOS) + // This is not thread-safe, and only used by the worker thread. + std::map retry_uuid_time_map_; +#endif CrashReportDatabase* database_; // weak }; diff --git a/util/misc/metrics.h b/util/misc/metrics.h index 63c673a3..dcd37a6a 100644 --- a/util/misc/metrics.h +++ b/util/misc/metrics.h @@ -89,6 +89,10 @@ class Metrics { //! and uploading it to the crash server. kPrepareForUploadFailed = 5, + //! \brief The upload of the crash failed during communication with the + //! server, but the upload can be retried later. + kUploadFailedButCanRetry = 6, + //! \brief The number of values in this enumeration; not a valid value. kMaxValue }; diff --git a/util/misc/uuid.cc b/util/misc/uuid.cc index 32ffea52..4ed38311 100644 --- a/util/misc/uuid.cc +++ b/util/misc/uuid.cc @@ -43,6 +43,10 @@ bool UUID::operator==(const UUID& that) const { return memcmp(this, &that, sizeof(*this)) == 0; } +bool UUID::operator<(const UUID& that) const { + return memcmp(this, &that, sizeof(*this)) < 0; +} + void UUID::InitializeToZero() { memset(this, 0, sizeof(*this)); } diff --git a/util/misc/uuid.h b/util/misc/uuid.h index 291baa9f..7e504a22 100644 --- a/util/misc/uuid.h +++ b/util/misc/uuid.h @@ -39,6 +39,7 @@ namespace crashpad { struct UUID { bool operator==(const UUID& that) const; bool operator!=(const UUID& that) const { return !operator==(that); } + bool operator<(const UUID& that) const; //! \brief Initializes the %UUID to zero. void InitializeToZero(); diff --git a/util/misc/uuid_test.cc b/util/misc/uuid_test.cc index 936e76bf..28d7ce4b 100644 --- a/util/misc/uuid_test.cc +++ b/util/misc/uuid_test.cc @@ -85,24 +85,40 @@ TEST(UUID, UUID) { EXPECT_EQ(uuid_2, uuid); EXPECT_FALSE(uuid != uuid_2); + // Test operator< operator + UUID uuid_3{}; + UUID uuid_4; + uuid_4.InitializeFromString("11111111-1111-1111-1111-111111111111"); + UUID uuid_5; + uuid_5.InitializeFromString("22222222-2222-2222-2222-222222222222"); + + EXPECT_LT(uuid_3, uuid_4); + EXPECT_LT(uuid_3, uuid_5); + EXPECT_LT(uuid_4, uuid_5); + // Make sure that operator== and operator!= check the entire UUID. ++uuid.data_1; EXPECT_NE(uuid, uuid_2); + EXPECT_LT(uuid_2, uuid); --uuid.data_1; ++uuid.data_2; EXPECT_NE(uuid, uuid_2); + EXPECT_LT(uuid_2, uuid); --uuid.data_2; ++uuid.data_3; EXPECT_NE(uuid, uuid_2); + EXPECT_LT(uuid_2, uuid); --uuid.data_3; for (size_t index = 0; index < base::size(uuid.data_4); ++index) { ++uuid.data_4[index]; EXPECT_NE(uuid, uuid_2); + EXPECT_LT(uuid_2, uuid); --uuid.data_4[index]; } for (size_t index = 0; index < base::size(uuid.data_5); ++index) { ++uuid.data_5[index]; EXPECT_NE(uuid, uuid_2); + EXPECT_LT(uuid_2, uuid); --uuid.data_5[index]; }