1
0
Fork 0

Add a script to fetch and diff GitHub workflow logs

This commit is contained in:
krcroft 2020-01-29 22:48:19 -08:00 committed by Patryk Obara
parent 89e9e7336f
commit 94cd6df6cd

View file

@ -0,0 +1,471 @@
#!/bin/bash
# Copyright (C) 2020 Kevin R Croft <krcroft@gmail.com>
# SPDX-License-Identifier: GPL-2.0-or-later
##
# This script crawls the current repo's GitHub workflow content.
# It fetches assets and logs from the most recent successful master
# run followed by the most recent (and possibly failing) current
# branch, which can also be a master branch.
# The goal of this script is two-fold:
# - Provide a mechanized and automated way to fetch CI records.
# - Provide a rapid way to diff bad CI runs against master.
#
# This script requires a GitHub account in order to generate an
# auth-token. Simply run the script, it will provide instructions.
#
# Strict mode: stop on command failures, on use of unset variables,
# and on a failure in any stage of a pipeline. Globs that match
# nothing expand to an empty list instead of the literal pattern.
set -o errexit -o nounset -o pipefail
shopt -s nullglob

# Fixed portion of the URL
readonly scheme='https://'
readonly authority='api.github.com'

# Colors (escape sequences that are later expanded by "echo -e")
readonly bold='\e[1m'
readonly red='\e[31m'
readonly green='\e[32m'
readonly yellow='\e[33m'
readonly cyan='\e[36m'
readonly reset='\e[0m'
##
# Moves the working directory to the top level of the git
# repository that holds this script. Only the first call does
# any work; it then records that fact in the in_root variable
# so later calls return immediately.
#
function cd_repo_root() {
  # Already switched on an earlier call
  if [[ "${in_root:-}" == "true" ]]; then
    return 0
  fi
  # Physical directory containing the script itself
  script_path="$(
    cd "$(dirname "$0")" > /dev/null 2>&1
    pwd -P
  )"
  pushd "$script_path" > /dev/null
  # From there, let git tell us where the repository root is
  pushd "$(git rev-parse --show-toplevel)" > /dev/null
  in_root=true
}
##
# Determines the full GitHub repo name ("owner/name") using the
# remote origin set in the repository's configuration, and builds
# the base URL for the repository's Actions REST endpoints.
#
# Globals read:    scheme, authority
# Globals written: repo, baseurl (both made read-only)
#
function init_baseurl() {
  cd_repo_root
  # Extract the full GitHub repo name from the origin. Handles both
  #   git@github.com:owner/name.git    (scp-like)
  #   https://github.com/owner/name.git
  # by dropping everything up to the last colon, the ".git" suffix,
  # and finally any leading "//host/" part.
  repo="$(git config --get remote.origin.url \
    | sed 's/.*://;s/\.git$//;s|//[^/]*/||')"
  # The scheme already ends in "//", so it is joined to the authority
  # without an extra slash (otherwise the URL starts with "https:///").
  baseurl="${scheme}${authority}/repos/${repo}/actions"
  declare -gr repo
  declare -gr baseurl
}
##
# Records the name of the branch currently checked out
# in the read-only global local_branch.
#
function init_local_branch() {
  cd_repo_root
  local_branch=$(git rev-parse --abbrev-ref HEAD)
  readonly local_branch
}
##
# Sets up NETRC credentials for GitHub's v3 API server.
# NETRC is currently the most secure way to provide
# credentials to CURL, because it prevents other processes
# from inspecting the process environment and command arguments
# (both of which can be found in /proc).
#
# If $HOME/.netrc already has a "machine <authority>" entry the
# function returns right away; otherwise it interactively asks
# for the account email and a personal access token and appends
# them to the file. Exits the script when either value is empty.
#
# Globals read: authority, HOME
#
function init_netrc() {
  # Check and setup the GitHub personal access token
  local netrc="$HOME/.netrc"
  local username=""
  local token=""
  # A missing netrc just means "no credentials yet"; testing for the
  # file first keeps grep from complaining about it on stderr.
  if [[ -f "$netrc" ]] && grep -q "^machine $authority" "$netrc"; then
    echo "Found credentials for $authority in $netrc"
    return
  fi
  # Get the username
  clear
  echo ""
  echo "1. Enter your GitHub account email address, example: jcousteau@scuba.fr"
  echo ""
  read -r -p "GitHub account email: " username
  # Get the token
  clear
  echo ""
  echo "2. Login to GitHub and visit https://github.com/settings/tokens"
  echo ""
  echo "3. Click 'Generate new token' and set the 'public_repo' permission:"
  echo ""
  echo " [ ] repo Full control of private repos"
  echo " [ ] repo:status Access commit status"
  echo " [ ] repo_deployment Access deployment status"
  echo " [X] public_repo Access public repositories"
  echo " [ ] repo:invite Access repository invitations"
  echo ""
  echo " Type a name for the token then scroll down and click 'Generate Token'."
  echo ""
  echo "4. Copy & paste your token, for example: f7e6b2344bd2c1487597b61d77703527a692a072"
  echo ""
  # Deliberately echo the token so the user can verify its correctness
  read -r -p "Personal Access Token: " token
  # Add the credential to netrc
  clear
  if [[ -n "${username:-}" && -n "${token:-}" ]]; then
    # Make sure netrc is only readable by the user *before* the token
    # is written, so a freshly created file is never exposed with the
    # default umask permissions.
    touch "$netrc"
    chmod 600 "$netrc"
    {
      echo "machine $authority"
      echo "login $username"
      echo "password $token"
      echo ""
    } >> "$netrc"
    echo "Added your credentials to $netrc"
  else
    echo "Failed to setup $netrc or some of the credentials were empty!"
    exit 1
  fi
}
##
# Makes strings suitable for directory and filenames
# - upper-case => lower-case
# - spaces => underscores
# - slashes => dashes
# - parenthesis => stripped
# - equals => dashes
#
# Arguments: $1 - the string to sanitize
# Outputs:   the sanitized string on stdout
#
# Implemented with Bash parameter expansion rather than sed so it
# does not depend on GNU sed's \L extension, and so that names
# which look like echo options (for example "-n") survive intact.
#
function sanitize_name() {
  local name="${1,,}"
  name="${name// /_}"
  name="${name//\//-}"
  name="${name//[()]/}"
  name="${name//=/-}"
  printf '%s\n' "$name"
}
##
# Prints the age of a file or directory in seconds, measured
# from its last-modification time (symlinks are followed).
#
# Arguments: $1 - path to inspect
#
function seconds_old() {
  local now
  local modified
  now="$(date +%s)"
  # Without a modification time there is nothing to report
  modified="$(stat -L --format %Y "$1")" || return 1
  echo $(( now - modified ))
}
##
# Creates a storage area for all content fetched by the script.
# This includes cached JSON output (valid for 5 minutes), along
# with zip assets, log files, and diffs.
#
# Globals read:    repo, USER (falls back to "id -un" when unset)
# Globals written: parent, assets_dir, cache_dir, diffs_dir, logs_dir
#                  (the four *_dir variables are made read-only)
# Side effects:    installs an INT trap that marks the storage area
#                  as interrupted and then exits.
#
declare -g parent # used by the trap
function init_dirs() {
  local repo_postfix
  local dir
  local filename
  # USER is not guaranteed to be set (cron, containers), and the
  # script runs with "set -u", so fall back to asking id.
  local user="${USER:-}"
  if [[ -z "$user" ]]; then
    user="$(id -un)"
  fi
  repo_postfix="$(basename "$repo")"
  # The location is deliberately predictable: the cache must be
  # found again by the next run.
  parent="/tmp/$repo_postfix-workflows-$user"
  assets_dir="$parent/assets"
  cache_dir="$parent/cache"
  diffs_dir="$parent/diffs"
  logs_dir="$parent/logs"
  declare -gr assets_dir
  declare -gr cache_dir
  declare -gr diffs_dir
  declare -gr logs_dir
  echo "Initializing storage area: $parent"
  # Don't trust content from a prior interrupted run
  if [[ -f "$parent/.interrupted" ]]; then
    # ":?" aborts instead of expanding to an empty path
    rm -rf -- "${parent:?}"
  fi
  for dir in "$assets_dir" "$cache_dir" "$diffs_dir" "$logs_dir"; do
    # Make the directories if they don't exist
    if [[ ! -d "$dir" ]]; then
      mkdir -p "$dir"
      continue
    fi
    # Otherwise, purge content older than 5-minutes
    for filename in "$dir"/*; do
      # An unmatched glob stays literal when nullglob is off
      [[ -e "$filename" ]] || continue
      if [[ "$(seconds_old "$filename")" -gt 300 ]]; then
        rm -rf -- "$filename"
      fi
    done
  done
  # If the user Ctrl-C'd the job, then some files might be
  # partially written, so drop a breadcrumb to clean up next run.
  # (we could just blow away the content here, but we want to
  # let the user inspect content after interrupting the run.)
  # The explicit exit makes sure the run really stops: a trapped
  # INT would otherwise let the script carry on with the next command.
  trap 'touch "$parent/.interrupted"; exit 130' INT
}
##
# Runs every setup step, in order, that must succeed before
# the script starts making REST queries and writing files.
#
function init() {
  local step
  for step in init_baseurl init_local_branch init_netrc init_dirs; do
    "$step"
  done
}
##
# Downloads a file if we otherwise don't have it.
# (Note that the script on launch cleans up files older than
# 5 minutes, so most of the time we'll be downloading.)
#
# Arguments: $1 - URL to fetch
#            $2 - destination filename
# Returns:   0 when the file already exists or was fetched,
#            otherwise curl's exit status.
#
# The transfer is written to "<destination>.part" and renamed only
# after curl succeeds, so a failed or aborted transfer can never be
# mistaken for a complete, cached file by a later call.
#
function download() {
  local url="$1"
  local outfile="$2"
  local partial="${outfile}.part"
  local rc=0
  if [[ -f "$outfile" ]]; then
    return 0
  fi
  curl --silent \
    --location \
    --netrc \
    "$url" \
    -o "$partial" || rc=$?
  if (( rc != 0 )); then
    rm -f -- "$partial"
    return "$rc"
  fi
  # curl may not create the output file when it received no data
  if [[ -e "$partial" ]]; then
    mv -f -- "$partial" "$outfile"
  fi
}
##
# Unzips files inside their containing directory.
# Clobbers existing files. The zip file is removed afterwards.
#
# Arguments: $1 - path to the zip file (absolute or relative)
# Returns:   unzip's exit status
#
function unpack() {
  local zipfile="$1"
  local zipdir
  local zipname
  local rc=0
  zipdir="$(dirname "$zipfile")"
  zipname="$(basename "$zipfile")"
  pushd "$zipdir" > /dev/null || return
  # We are now inside the zip's own directory, so refer to it by its
  # base name; the original (possibly relative) path would no longer
  # resolve from here.
  unzip -qq -o "./$zipname" || rc=$?
  # Remove the archive either way: a corrupt one would otherwise be
  # picked up again as an already-downloaded file.
  rm -f -- "./$zipname"
  popd > /dev/null
  return "$rc"
}
##
# Constructs and fetches REST urls using our personal access
# token. Files are additionally hashed based on the REST URL
# and cached. This allows for rapid-rerunning without needing
# to hit GitHub's API again (for duplicate requests). This
# avoids us exceeding our repo limit on API calls/day.
#
# Globals read: baseurl, cache_dir
# Arguments:    path elements appended, in order, to baseurl
# Outputs:      the (possibly cached) response body on stdout
# Returns:      non-zero when the response could not be fetched
#
function pull() {
  # Buildup the REST URL by appending arguments
  local url="$baseurl"
  local element
  for element in "$@"; do
    url="$url/$element"
  done
  # The cache key is the md5 of the URL (including echo's newline)
  local url_hash
  url_hash="$(echo "$url" | md5sum | cut -f1 -d' ')"
  local outfile="${cache_dir}/${url_hash}.json"
  if [[ ! -f "$outfile" ]]; then
    # Stop here on failure rather than cat-ing a file that isn't there
    download "$url" "$outfile" || return
  fi
  cat "$outfile"
}
##
# Reads JSON on stdin and prints the requested key(s) of
# every record found in the named container array.
#
# Arguments: $1 - name of the top-level array
#            $2 - jq expression selecting the keys to print
#
function get_all() {
  local container="$1"
  local return_keys="$2"
  local filter=".${container}[] | ${return_keys}"
  jq -r "$filter"
}
##
# Reads JSON on stdin and prints the requested return key(s) of
# those records whose search key equals the search value.
#
# Arguments: $1 - name of the top-level array
#            $2 - key to compare
#            $3 - value the key must equal (passed to jq as $value)
#            $4 - jq expression selecting the keys to print
#
function query() {
  local container="$1"
  local search_key="$2"
  local search_value="$3"
  local return_keys="$4"
  local filter
  # "$value" stays literal here; jq resolves it from --arg
  printf -v filter '.%s[] | if .%s == $value then %s else empty end' \
    "$container" "$search_key" "$return_keys"
  jq -r --arg value "$search_value" "$filter"
}
##
# Pulls the subset of active workflows from GitHub having
# path values that match the local repo's filenames inside
# .github/workflows (otherwise there are 30+ workflows).
# Both the .yml and .yaml extensions are considered.
#
# The workflow numeric ID and textual name are stored as the keys
# and (sanitized) values of the global associative array
# "workflows", respectively.
#
# API References:
# - https://developer.github.com/v3/actions/workflows/
# - GET /repos/:owner/:repo/actions/workflows
#
function fetch_workflows() {
  unset workflows
  declare -gA workflows
  local workflow_path
  local result
  local id
  local name
  for workflow_path in ".github/workflows/"*.yml ".github/workflows/"*.yaml; do
    # Skip a pattern that matched nothing (it stays literal unless
    # nullglob is on)
    [[ -f "$workflow_path" ]] || continue
    # jq prints the id and the name on two consecutive lines
    result="$(pull workflows \
      | query workflows path "$workflow_path" '.id,.name')"
    id="${result%$'\n'*}"
    name="${result#*$'\n'}"
    # Skip empty values and a couple master-only workflows
    if [[ -z "${id:-}" \
      || -z "${name:-}" \
      || "$name" == "Config heavy" \
      || "$name" == "Coverity Scan" ]]; then
      continue
    fi
    workflows["$id"]="$(sanitize_name "$name")"
  done
}
##
# Fetches the first run identifier for a given workflow ID
# and branch name. The run ID is stored in the global run_id
# variable, which is left empty when no run matches.
#
# Arguments: $1 - workflow ID
#            $2 - branch name
#
# API References:
# - https://developer.github.com/v3/actions/workflow_runs
# - GET /repos/:owner/:repo/actions/runs/:run_id
#
function fetch_workflow_run() {
  declare -g run_id
  local workflow_id="$1"
  local branch="$2"
  local matching_ids
  # GET /repos/:owner/:repo/actions/workflows/:workflow_id/runs
  matching_ids="$(pull workflows "$workflow_id" runs \
    | query workflow_runs head_branch "$branch" '.id')"
  # Keep only the first line. This is done in the shell instead of
  # piping into "head -1": head closes the pipe early, and with
  # "set -o pipefail" the resulting SIGPIPE in the writer would make
  # the whole pipeline (and hence the script) fail on long run lists.
  run_id="${matching_ids%%$'\n'*}"
}
##
# Fetches artifact names and download URLs for the run held in the
# global run_id, and stores them as the (sanitized) keys and values
# of the global associative array "artifacts", respectively.
#
# API References:
# - https://developer.github.com/v3/actions/artifacts
# - GET /repos/:owner/:repo/actions/runs/:run_id/artifacts
#
function fetch_run_artifacts() {
  unset artifacts
  declare -gA artifacts
  local name
  local url
  local key
  # jq prints each record as two lines: the name, then the URL.
  # Reading both in the loop condition ends the loop cleanly on a
  # trailing unpaired line instead of failing inside the body.
  while read -r name && read -r url; do
    key="$(sanitize_name "$name")"
    # An empty key is not a valid associative-array subscript
    [[ -n "$key" ]] || continue
    artifacts["$key"]="$url"
  done < <(pull runs "$run_id" artifacts \
    | get_all artifacts '.name,.archive_download_url')
}
##
# Fetches the job IDs and job names, for the run held in the global
# run_id, of those jobs that finished with the given conclusion.
# The job IDs and (sanitized) names are stored as the keys and
# values of the global associative array "jobs_array", respectively.
#
# Arguments: $1 - conclusion to select: success or failure
#
# API References:
# - https://developer.github.com/v3/actions/workflow_jobs
# - GET /repos/:owner/:repo/actions/runs/:run_id/jobs
#
function fetch_run_jobs() {
  unset jobs_array
  declare -gA jobs_array
  local conclusion="$1" # success or failure
  local id
  local name
  # jq prints each record as two lines: the id, then the name.
  # Reading both in the loop condition ends the loop cleanly on a
  # trailing unpaired line instead of failing inside the body.
  while read -r id && read -r name; do
    # An empty id is not a valid associative-array subscript
    [[ -n "$id" ]] || continue
    jobs_array["$id"]="$(sanitize_name "$name")"
  done < <(pull runs "$run_id" jobs \
    | query jobs conclusion "$conclusion" '.id,.name')
}
##
# Fetches a job's log and saves it in the provided output
# filename. Along the way each line is simplified for easier
# text processing: everything up to the time-stamp's trailing
# "Z" is dropped, the first ":<digits>:<digits>:" group is
# removed, and square brackets are turned into dots.
#
# Arguments: $1 - job ID
#            $2 - output filename
#
# API References:
# - https://developer.github.com/v3/actions/workflow_jobs
# - GET /repos/:owner/:repo/actions/jobs/:job_id/logs
#
function fetch_job_log() {
  local job="$1"
  local destination="$2"
  pull jobs "$job" logs | sed \
    -e 's/^.*Z[ \t]*//' \
    -e 's/:[[:digit:]]*:[[:digit:]]*://' \
    -e 's/\[/./g' \
    -e 's/\]/./g' \
    > "$destination"
}
##
# Crawl workflows, runs, and jobs for the master and current branch.
# While crawling, download assets and logs, and if a run failed, diff
# that log against the last successful master-equivalent having the same
# workflow and job type.
#
# Globals read: local_branch, assets_dir, logs_dir, diffs_dir, the color
#               constants, and the workflows / artifacts / jobs_array /
#               run_id globals filled in by the fetch_* helpers.
#
# TODO - Refactor into smaller functions and try to flatten the loop depth.
# TODO - Improve the log differ to something that can lift out just the
#        gcc/clang/visual-studio warnings and errors, and diff them.
#
function main() {
  local workflow_id workflow_name
  local branch branch_name joiner
  local artifact_name artifact_url asset_file
  local conclusion color
  local job_id job_name
  local log_file successful_master_log
  local sanitized_branch_name diff_file

  # Setup all pre-requisites
  init
  echo "Comparing branch $local_branch with master"
  echo ""
  # Fetch the workflows, to be used throughout the run
  fetch_workflows
  # Step through each workflow
  for workflow_id in "${!workflows[@]}"; do
    workflow_name="${workflows[$workflow_id]}"
    echo -e "${bold}${workflow_name}${reset} workflow [$workflow_id]"
    # Within the workflows, we're interested in finding the newest subset of
    # runs that match our current branch as well as the master branch.
    # Master goes first so its successful logs exist by the time a failed
    # job on the current branch wants to be diffed against them.
    for branch in master current; do
      if [[ "$branch" == "current" ]]; then
        branch_name="$local_branch"
      else
        branch_name="master"
      fi
      # Get the run identifier for the given workflow and branch
      fetch_workflow_run "$workflow_id" "$branch_name"
      if [[ -z "${run_id:-}" ]]; then
        echo " \`- no runs found for $branch_name"
        continue
      fi
      if [[ "$branch" == "master" ]]; then
        joiner="|-"
      else
        joiner="\`-"
      fi
      echo " $joiner found latest $branch_name run [$run_id]"
      # Download the artifacts produced during the selected run
      fetch_run_artifacts
      if [[ "$branch" == "master" ]]; then
        joiner="|"
      else
        joiner=" "
      fi
      for artifact_name in "${!artifacts[@]}"; do
        artifact_url="${artifacts[$artifact_name]}"
        asset_file="$assets_dir/$workflow_name-$artifact_name-$branch.zip"
        download "$artifact_url" "$asset_file"
        unpack "$asset_file"
        echo -e " $joiner - unpacking ${cyan}${artifact_name}${reset} asset"
      done
      # Download the logs for the jobs within the selected run
      for conclusion in failure success; do
        if ! fetch_run_jobs "$conclusion"; then
          # No job has been selected yet at this point, so report the
          # run that was skipped (job_id may not even be set here).
          echo " \`- skipped $run_id $conclusion"
          continue
        fi
        if [[ "$conclusion" == "success" ]]; then
          color="${green}"
        else
          color="${red}"
        fi
        for job_id in "${!jobs_array[@]}"; do
          job_name="${jobs_array[$job_id]}"
          echo -e " $joiner - fetching ${color}${job_name}${reset} ${conclusion} log"
          log_file="$logs_dir/$workflow_name-$job_name-$branch-$conclusion.log"
          successful_master_log="$logs_dir/$workflow_name-$job_name-master-success.log"
          fetch_job_log "$job_id" "$log_file"
          # In the event we've found a failed job, try to diff it against a prior
          # successful master job of the equivalent workflow and job-type.
          if [[ "$conclusion" == "failure"
            && -f "$log_file"
            && -f "$successful_master_log" ]]; then
            sanitized_branch_name="$(sanitize_name "$branch_name")"
            diff_file="$diffs_dir/$workflow_name-$job_name-$sanitized_branch_name-vs-master.log"
            # diff exits non-zero when the files differ, which is the
            # expected case here and must not abort the script.
            diff "$log_file" "$successful_master_log" > "$diff_file" || true
            echo -e " - diffed ${yellow}$diff_file${reset}"
          fi
        done # jobs_array
      done # conclusion types
      echo " $joiner"
    done # branch types
  done # workflows
}
# Script entry point: run the crawl (no command-line arguments are passed on)
main