From 1455319c0099a96a2234bce56bfac6195df7d14d Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Thu, 11 Apr 2024 10:19:07 +0200 Subject: [PATCH] Fix retry for UniqueConstraintViolation in RouteCreate (v3) (#3697) The creation of internal routes depends on finding the next vip_offset. When creating multiple routes in parallel, they find the same vip_offset and only the very first write operation succeeds. All the others fail with a UniqueConstraintViolation error. For this error a retry is triggered in the VCAP::CloudController::RouteCreate action (v3). This retry did not work due to a bug ("undefined local variable or method 'user_audit_info'"). Besides fixing this bug a test has been added to ensure that the parallel creation of internal routes is working. When running 5 parallel threads and each thread is creating an internal route, the following logs are produced: Duplicate entry '1' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '1' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '1' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '1' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '2' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '2' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '2' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '3' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '3' for key 'routes.routes_vip_offset_index', retrying Duplicate entry '4' for key 'routes.routes_vip_offset_index', retrying This indicates that all the write operations are being executed at the same point in time and only one of them succeeds whereas the others are failing, i.e. 4 out of 5. For each retry there is one conflicting route less, so the number of errors goes down from 4 to 3 to 2 to 1 until the last route could be saved (after 4 retries). --- app/actions/route_create.rb | 4 +- .../parallel_route_creation_spec.rb | 40 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 spec/integration/parallel_route_creation_spec.rb diff --git a/app/actions/route_create.rb b/app/actions/route_create.rb index 0c3812abba0..21d08fd8fdd 100644 --- a/app/actions/route_create.rb +++ b/app/actions/route_create.rb @@ -35,8 +35,8 @@ def create(message:, space:, domain:, manifest_triggered: false) rescue Sequel::ValidationFailed => e validation_error!(e, route.host, route.path, route.port, space, domain) rescue Sequel::UniqueConstraintViolation => e - logger.warn("error creating route #{e}, retrying once") - RouteCreate.new(user_audit_info).create(message:, space:, domain:, manifest_triggered:) + logger.warn("error creating route #{e}, retrying") + RouteCreate.new(@user_audit_info).create(message:, space:, domain:, manifest_triggered:) end private diff --git a/spec/integration/parallel_route_creation_spec.rb b/spec/integration/parallel_route_creation_spec.rb new file mode 100644 index 00000000000..4230f922cce --- /dev/null +++ b/spec/integration/parallel_route_creation_spec.rb @@ -0,0 +1,40 @@ +require 'spec_helper' + +module VCAP::CloudController + RSpec.describe RouteCreate do + describe 'parallel creation of internal routes' do + it 'retries until find_next_vip_offset does not return a conflicting number' do + # Don't create events + allow_any_instance_of(Repositories::RouteEventRepository).to receive(:record_route_create) + + threads = [] + 10.times do + threads << Thread.new do + user_audit_info = UserAuditInfo.new(user_email: Sham.email, user_guid: Sham.guid) + message = RouteCreateMessage.new(host: Sham.host) + space = Space.make + domain = SharedDomain.make(internal: true) + + route = nil + expect do + route = RouteCreate.new(user_audit_info).create(message:, space:, domain:) + end.not_to raise_error + expect(route).to exist + + # Wait until all routes are created... + sleep(1) + delete_db_entries(route, domain, space) + end + end + threads.each(&:join) + end + end + + def delete_db_entries(route, domain, space) + organization = space.organization + quota_definition = organization.quota_definition + + [route, domain, space, organization, quota_definition].each(&:delete) + end + end +end