Skip to content

Commit

Permalink
Merge pull request FRRouting#101 from RodrigoMNardi/feature/github/ca…
Browse files Browse the repository at this point in the history
…ncel_by_timer

Execution Hanging Detection
  • Loading branch information
RodrigoMNardi authored Oct 7, 2024
2 parents b329cf1 + caeaf16 commit e472ef8
Show file tree
Hide file tree
Showing 9 changed files with 189 additions and 2 deletions.
1 change: 1 addition & 0 deletions lib/github/plan_execution/finished.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Finished
def initialize(payload)
@check_suite = CheckSuite.find_by(bamboo_ci_ref: payload['bamboo_ref'])
@logger = GithubLogger.instance.create('github_plan_execution_finished.log', Logger::INFO)
@hanged = payload['hanged'] || false
end

def finished
Expand Down
15 changes: 13 additions & 2 deletions lib/github/update_status.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def update_status
case @status
when 'in_progress'
@job.in_progress(@github_check)
create_timeout_worker
when 'success'
@job.success(@github_check)
@job.update_execution_time
Expand All @@ -81,6 +82,16 @@ def update_status
[500, 'Internal Server Error']
end

def create_timeout_worker
Delayed::Job.where('handler LIKE ?', "%TimeoutExecution%args%-%#{@check_suite.id}%")&.delete_all

logger(Logger::INFO, "CiJobStatus::Update: TimeoutExecution for '#{@check_suite.id}'")

TimeoutExecution
.delay(run_at: 2.hours.from_now.utc, queue: 'timeout_execution')
.timeout(@check_suite.id)
end

def insert_new_delayed_job
queue = @job.check_suite.pull_request.github_pr_id % 10

Expand All @@ -91,7 +102,7 @@ def delete_and_create_delayed_job(queue)
fetch_delayed_job&.destroy_all

CiJobStatus
.delay(run_at: DELAYED_JOB_TIMER.seconds.from_now, queue: queue)
.delay(run_at: DELAYED_JOB_TIMER.seconds.from_now.utc, queue: queue)
.update(@job.check_suite.id, @job.id)
end

Expand All @@ -117,7 +128,7 @@ def failure
return failures_stats if @failures.is_a? Array and !@failures.empty?

CiJobFetchTopotestFailures
.delay(run_at: 5.minutes.from_now, queue: 'fetch_topotest_failures')
.delay(run_at: 5.minutes.from_now.utc, queue: 'fetch_topotest_failures')
.update(@job.id, 1)
end

Expand Down
1 change: 1 addition & 0 deletions lib/github_ci_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

# Workers
require_relative '../workers/ci_job_status'
require_relative '../workers/timeout_execution'
require_relative '../workers/ci_job_fetch_topotest_failures'

# Slack libs
Expand Down
4 changes: 4 additions & 0 deletions lib/models/check_suite.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,8 @@ def in_progress?
def execution_started?
ci_jobs.where(status: :in_progress).size < 2
end

def last_job_updated_at_timer
ci_jobs.max_by(&:updated_at)&.updated_at
end
end
1 change: 1 addition & 0 deletions spec/lib/github/update_status_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
before do
allow(Github::PlanExecution::Finished).to receive(:new).and_return(fake_finish_plan)
allow(fake_finish_plan).to receive(:fetch_build_status)
allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
end

describe 'Validates different Ci Job status' do
Expand Down
42 changes: 42 additions & 0 deletions spec/lib/models/check_suite_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# ci_job_spec.rb
# Part of NetDEF CI System
#
# Copyright (c) 2023 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

describe CheckSuite do
context '#execution_started?' do
let(:check_suite) { create(:check_suite) }
let(:check_suite_running) { create(:check_suite, :with_in_progress) }

it 'returns true when there are less than 2 jobs in progress' do
expect(check_suite.execution_started?).to be_truthy
end

it 'returns false' do
expect(check_suite_running.execution_started?).to be_falsey
end
end

context '#last_job_updated_at_timer? -> success' do
let(:ci_job) { create(:ci_job) }
let(:check_suite) { create(:check_suite, ci_jobs: [ci_job]) }

it 'returns false' do
expect(check_suite.last_job_updated_at_timer).not_to be_nil
end
end

context '#last_job_updated_at_timer? -> error' do
let(:ci_job) { create(:ci_job, updated_at: nil) }
let(:check_suite) { create(:check_suite, ci_jobs: []) }

it 'returns false' do
expect(check_suite.last_job_updated_at_timer).to be_nil
end
end
end
2 changes: 2 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def app
config.include FactoryBot::Syntax::Methods
config.include WebMock::API

config.add_formatter('json', 'tmp/rspec_results.json')

pid = nil

config.before(:suite) do
Expand Down
76 changes: 76 additions & 0 deletions spec/workers/timeout_execution_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# slack_bot_spec.rb
# Part of NetDEF CI System
#
# Copyright (c) 2023 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

describe TimeoutExecution do
let(:timeout_execution) { described_class.instance }
let(:finished_instance) { Github::PlanExecution::Finished.new({}) }

before do
allow(Github::PlanExecution::Finished).to receive(:new).and_return(finished_instance)
end

context 'when timeout is called, but still running' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(true)
end

it 'calls timeout job' do
expect(described_class.timeout(check_suite.id)).to be_falsey
end
end

context 'when timeout is called, but hanged' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(false)
allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc - 3.hours)
allow(finished_instance).to receive(:finished).and_return([200, 'Finished'])
end

it 'calls timeout job' do
expect(described_class.timeout(check_suite.id)).to be_truthy
end
end

context 'when timeout is called and rescheduled' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(false)
allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc + 3.hours)
allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
end

it 'calls timeout job' do
expect(described_class.timeout(check_suite.id)).to be_falsey
end
end

context 'when timeout is called, last update in 2 hour ago' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(false)
allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc - 3.hours)
end

it 'calls timeout job' do
expect(TimeoutExecution).to receive(:timeout)
expect(described_class.timeout(check_suite.id)).to be_falsey
end
end
end
49 changes: 49 additions & 0 deletions workers/timeout_execution.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# ci_job_status.rb
# Part of NetDEF CI System
#
# Copyright (c) 2024 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

require_relative '../config/setup'

class TimeoutExecution
class << self
def timeout(check_suite_id)
@logger = GithubLogger.instance.create('timeout_execution_worker.log', Logger::INFO)
check_suite = CheckSuite.find(check_suite_id)

@logger.info("Timeout execution for check_suite_id: #{check_suite_id} -> finished? #{check_suite.finished?}")

return false if check_suite.finished?
return rescheduling([], check_suite_id) if check_suite.last_job_updated_at_timer > 2.hour.ago.utc

@logger.info("Calling Github::PlanExecution::Finished.new(#{check_suite.bamboo_ci_ref}).finished")

rescheduling(finished(check_suite), check_suite_id)
end

def finished(check_suite)
Github::PlanExecution::Finished
.new({ 'bamboo_ref' => check_suite.bamboo_ci_ref, hanged: true })
.finished
end

def rescheduling(resp, check_suite_id)
return true if resp == [200, 'Finished']

@logger.info("Rescheduling check_suite_id: #{check_suite_id}")

Delayed::Job.where('handler LIKE ?', "%TimeoutExecution%args%-%#{check_suite_id}%").delete_all

TimeoutExecution
.delay(run_at: 2.hours.from_now.utc, queue: 'timeout_execution')
.timeout(check_suite_id)

false
end
end
end

0 comments on commit e472ef8

Please sign in to comment.