# metrics/configs/failures-config.yaml

# Metric definition: jobs that have gone a long time without passing.
#
# `query` is BigQuery standard SQL; the `#standardSQL` prefix must precede the
# query text, so keep it as the first line of the block scalar.
#
# The query returns one row per job that started at least one build in the
# last 7 days, with these columns:
#   job            job name
#   latest_pass    most recent `started` timestamp among rows whose result is
#                  'SUCCESS'; NULL when the job has no such row
#   weekly_builds  number of rows for the job started within the last 7 days
#   first_run      date of the job's earliest `started` value in the table
#   latest_run     date of the job's latest `started` value in the table
#   broken_days    days from date(latest_pass) to today, or from first_run to
#                  today when latest_pass is NULL
# Rows are sorted by broken_days (longest first), then by the tie-breakers in
# the final `order by`.
metric: failures
description: This query finds jobs that have been failing continuously for a long time.
query: |
  #standardSQL
  select /* Find jobs that have not passed in a long time */
    jobs.job,
    latest_pass, /* how recently did this job pass */
    weekly_builds,  /* how many times a week does it run */
    first_run, /* when is the first time it ran */
    latest_run,  /* when is the most recent run */
    /* days since the last pass; a job that never passed counts from first_run */
    DATE_DIFF(current_date(), if(latest_pass is null, first_run, date(latest_pass)), DAY) broken_days
  from (
    select /* filter to jobs that ran this week */
      job,
      count(1) weekly_builds
    from `kubernetes-public.k8s_infra_kettle.all`
    where
      started > timestamp_sub(current_timestamp(), interval 7 day)
    group by job
    /* no order by here: only the outermost order by determines row order */
  ) jobs
  left join (
    select /* find the oldest, newest run of each job */
      job,
      date(min(started)) first_run,
      date(max(started)) latest_run
    from `kubernetes-public.k8s_infra_kettle.all`
    group by job
  ) runs
  on jobs.job = runs.job
  left join (
    select /* find the most recent time each job passed (may not be this week) */
      job,
      max(started) latest_pass
    from `kubernetes-public.k8s_infra_kettle.all`
    where
      result = 'SUCCESS'
    group by job
  ) passes
  on jobs.job = passes.job
  order by broken_days desc, latest_pass, first_run, weekly_builds desc, jobs.job

# jq program applied to the query output (presumably a JSON array of row
# objects carrying the columns selected by `query` -- confirm against the
# consumer of this config).
#
# It keeps each row whose latest_pass has length 0 (null or empty) or whose
# broken_days, converted to a number, is greater than 30, and turns it into a
# single-key object. `add` then merges those objects into one:
#   {"<job>": {"failing_days": <broken_days as a number>}, ...}
#
# NOTE(review): `add` over an empty array yields null, so the output is null
# rather than {} when no row matches -- confirm the consumer tolerates that.
jqfilter: |
  [(.[] | select((.latest_pass|length) == 0 or (.broken_days|tonumber) > 30)
  | {(.job): {
      failing_days: (.broken_days|tonumber)
  }})] | add