diff --git a/scripts/diff-workflow-against-master.sh b/scripts/diff-workflow-against-master.sh
new file mode 100755
index 00000000..88f1b314
--- /dev/null
+++ b/scripts/diff-workflow-against-master.sh
@@ -0,0 +1,471 @@
+#!/bin/bash
+
+# Copyright (C) 2020 Kevin R Croft
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+##
+# This script crawls the current repo's GitHub workflow content.
+# It fetches assets and logs from the most recent successful master
+# run, followed by the most recent (and possibly failing) run of the
+# current branch, which may itself be master.
+# The goal of this script is twofold:
+# - Provide a mechanized, automated way to fetch CI records.
+# - Provide a rapid way to diff bad CI runs against master.
+#
+# This script requires a GitHub account in order to generate an
+# auth-token. Simply run the script; it will provide instructions.
+#
+set -euo pipefail
+shopt -s nullglob
+
+# Fixed portion of the URL
+declare -gr scheme="https://"
+declare -gr authority="api.github.com"
+
+# Colors
+declare -gr bold="\\e[1m"
+declare -gr red="\\e[31m"
+declare -gr green="\\e[32m"
+declare -gr yellow="\\e[33m"
+declare -gr cyan="\\e[36m"
+declare -gr reset="\\e[0m"
+
+##
+# Changes the working directory to that of the
+# repository's root.
+#
+function cd_repo_root() {
+    if [[ "${in_root:-}" != "true" ]]; then
+        script_path="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
+        pushd "$script_path" > /dev/null
+        pushd "$(git rev-parse --show-toplevel)" > /dev/null
+        in_root=true
+    fi
+}
+
+##
+# Determines the full GitHub repo name using the
+# remote origin set in the repository's configuration.
+#
+function init_baseurl() {
+    cd_repo_root
+    # Extract the full GitHub repo name from the origin
+    repo="$(git config --get remote.origin.url | sed 's/.*://;s/\.git$//;s^//[^/]*/^^')"
+    baseurl="${scheme}${authority}/repos/$repo/actions"
+    declare -gr repo
+    declare -gr baseurl
+}
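+
+# A sketch of what init_baseurl derives (hypothetical origins, not
+# taken from this change): an SSH origin such as
+#   git@github.com:example-org/example-repo.git
+# and an HTTPS origin such as
+#   https://github.com/example-org/example-repo.git
+# both reduce to "example-org/example-repo", so baseurl becomes
+#   https://api.github.com/repos/example-org/example-repo/actions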
+
+##
+# Determines the local branch name
+#
+function init_local_branch() {
+    cd_repo_root
+    local_branch="$(git rev-parse --abbrev-ref HEAD)"
+    declare -gr local_branch
+}
+
+##
+# Sets up NETRC credentials for GitHub's v3 API server.
+# NETRC is currently the most secure way to provide
+# credentials to curl, because it prevents other processes
+# from inspecting the process environment and command
+# arguments (both of which can be found in /proc).
+#
+function init_netrc() {
+    # Check and set up the GitHub personal access token
+    local netrc="$HOME/.netrc"
+    if [[ -f "$netrc" ]] && grep -q "^machine $authority" "$netrc"; then
+        echo "Found credentials for $authority in $netrc"
+        return
+    fi
+    # Get the username
+    clear
+    echo ""
+    echo "1. Enter your GitHub account email address, example: jcousteau@scuba.fr"
+    echo ""
+    read -r -p "GitHub account email: " username
+
+    # Get the token
+    clear
+    echo ""
+    echo "2. Login to GitHub and visit https://github.com/settings/tokens"
+    echo ""
+    echo "3. Click 'Generate new token' and set the 'public_repo' permission:"
+    echo ""
+    echo "   [ ] repo             Full control of private repos"
+    echo "   [ ] repo:status      Access commit status"
+    echo "   [ ] repo_deployment  Access deployment status"
+    echo "   [X] public_repo      Access public repositories"
+    echo "   [ ] repo:invite      Access repository invitations"
+    echo ""
+    echo "   Type a name for the token then scroll down and click 'Generate Token'."
+    echo ""
+    echo "4. Copy & paste your token, for example: f7e6b2344bd2c1487597b61d77703527a692a072"
+    echo ""
+    # Deliberately echo the token so the user can verify its correctness
+    read -r -p "Personal Access Token: " token
+
+    # Add the credentials to netrc
+    clear
+    if [[ -n "${username:-}" && -n "${token:-}" ]]; then
+        {
+            echo "machine $authority"
+            echo "login $username"
+            echo "password $token"
+            echo ""
+        } >> "$netrc"
+        echo "Added your credentials to $netrc"
+        # Ensure netrc is only readable by the user
+        chmod 600 "$netrc"
+    else
+        echo "Failed to set up $netrc: some of the credentials were empty!"
+        exit 1
+    fi
+}
+
+##
+# Makes strings suitable for directory and file names:
+# - spaces => underscores
+# - upper-case => lower-case
+# - slashes => dashes
+# - parentheses => stripped
+# - equals => dashes
+#
+function sanitize_name() {
+    local name="$1"
+    echo "$name" | sed 's/\(.*\)/\L\1/;s/ /_/g;s^/^-^g;s/[()]//g;s/=/-/g'
+}
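+
+# A sketch of the transform (hypothetical workflow name):
+#   sanitize_name "Code Analysis (gcc/clang)"
+# prints:
+#   code_analysis_gcc-clang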
+
+##
+# Returns how old a file or directory is, in seconds.
+#
+function seconds_old() {
+    echo $(( "$(date +%s)" - "$(stat -L --format %Y "$1")" ))
+}
+
+##
+# Creates a storage area for all content fetched by the script.
+# This includes cached JSON output (valid for 5 minutes), along
+# with zip assets, log files, and diffs.
+#
+declare -g parent # used by the trap
+function init_dirs() {
+    local repo_postfix
+    repo_postfix="$(basename "$repo")"
+    parent="/tmp/$repo_postfix-workflows-$USER"
+    assets_dir="$parent/assets"
+    cache_dir="$parent/cache"
+    diffs_dir="$parent/diffs"
+    logs_dir="$parent/logs"
+    declare -gr assets_dir
+    declare -gr cache_dir
+    declare -gr diffs_dir
+    declare -gr logs_dir
+    echo "Initializing storage area: $parent"
+
+    # Don't trust content from a prior interrupted run
+    if [[ -f "$parent/.interrupted" ]]; then
+        rm -rf "$parent"
+    fi
+
+    # Make the directories if they don't exist
+    for dir in "$assets_dir" "$cache_dir" "$diffs_dir" "$logs_dir"; do
+        if [[ ! -d "$dir" ]]; then
+            mkdir -p "$dir"
+        # Otherwise, purge content older than 5 minutes
+        else
+            for filename in "$dir"/*; do
+                if [[ "$(seconds_old "$filename")" -gt 300 ]]; then
+                    rm -rf "$filename"
+                fi
+            done
+        fi
+    done
+    # If the user Ctrl-C'd the job, then some files might be
+    # partially written, so drop a breadcrumb to clean up next run.
+    # (We could just blow away the content here, but we want to
+    # let the user inspect content after interrupting the run.)
+    trap 'touch $parent/.interrupted' INT
+}
+
+##
+# Ensures all prerequisites are set up and have passed
+# before we start making REST queries and writing files.
+#
+function init() {
+    init_baseurl
+    init_local_branch
+    init_netrc
+    init_dirs
+}
+
+##
+# Downloads a file if we don't already have it.
+# (Note that on launch the script cleans up files older than
+# 5 minutes, so most of the time we'll be downloading.)
+#
+function download() {
+    local url="$1"
+    local outfile="$2"
+    if [[ ! -f "$outfile" ]]; then
+        curl --silent \
+             --location \
+             --netrc \
+             "$url" \
+             -o "$outfile"
+    fi
+}
+
+##
+# Unzips files inside their containing directory.
+# Clobbers existing files.
+#
+function unpack() {
+    local zipfile="$1"
+    local zipdir
+    zipdir="$(dirname "$zipfile")"
+    pushd "$zipdir" > /dev/null
+    unzip -qq -o "$zipfile"
+    rm -f "$zipfile"
+    popd > /dev/null
+}
+
+##
+# Constructs and fetches REST URLs using our personal access
+# token. Responses are cached in files named after a hash of the
+# REST URL, which allows rapid re-running without hitting
+# GitHub's API again for duplicate requests. This avoids
+# exceeding our repo's daily limit on API calls.
+#
+function pull() {
+    # Build up the REST URL by appending arguments
+    local url="$baseurl"
+    for element in "$@"; do
+        url="$url/$element"
+    done
+    local url_hash
+    url_hash="$(echo "$url" | md5sum | cut -f1 -d' ')"
+    local outfile="${cache_dir}/${url_hash}.json"
+    if [[ ! -f "$outfile" ]]; then
+        download "$url" "$outfile"
+    fi
+    cat "$outfile"
+}
+
+##
+# Gets one or more keys from all records
+#
+function get_all() {
+    local container="$1"
+    local return_keys="$2"
+    jq -r '.'"$container"'[] | '"${return_keys}"
+}
+
+##
+# Gets one or more return_key(s) from records that have
+# matching search_key and search_value hits
+#
+function query() {
+    local container="$1"
+    local search_key="$2"
+    local search_value="$3"
+    local return_keys="$4"
+    jq -r --arg value "$search_value" \
+        '.'"${container}"'[] | if .'"${search_key}"' == $value then '"${return_keys}"' else empty end'
+}
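+
+# A sketch of how pull and query compose (the workflow path here is
+# hypothetical):
+#   pull workflows \
+#       | query workflows path ".github/workflows/linux.yml" '.id,.name'
+# builds the jq filter
+#   .workflows[] | if .path == $value then .id,.name else empty end
+# and prints the matching workflow's ID and name on separate lines.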
+
+##
+# Pulls the subset of active workflows from GitHub having
+# path values that match the local repo's filenames inside
+# .github/workflows (otherwise there are 30+ workflows).
+#
+# The workflow numeric IDs and textual names are stored as the
+# keys and values, respectively, of an associative array.
+#
+# API References:
+# - https://developer.github.com/v3/actions/workflows/
+# - GET /repos/:owner/:repo/actions/workflows
+#
+function fetch_workflows() {
+    unset workflows
+    declare -gA workflows
+    for workflow_path in ".github/workflows/"*.yml; do
+        local result
+        result="$(pull workflows \
+            | query workflows path "$workflow_path" '.id,.name')"
+        local id
+        id="${result%$'\n'*}"
+        local name
+        name="${result#*$'\n'}"
+
+        # Skip empty values and a couple of master-only workflows
+        if [[ -z "${id:-}" \
+           || -z "${name:-}" \
+           || "$name" == "Config heavy" \
+           || "$name" == "Coverity Scan" ]]; then
+            continue
+        fi
+        workflows["$id"]="$(sanitize_name "$name")"
+    done
+}
+
+##
+# Fetches the first run identifier for a given workflow ID
+# and branch name. The run ID is stored in the run_id variable.
+#
+# API References:
+# - https://developer.github.com/v3/actions/workflow_runs
+# - GET /repos/:owner/:repo/actions/runs/:run_id
+#
+function fetch_workflow_run() {
+    declare -g run_id
+    local workflow_id="$1"
+    local branch="$2"
+    # GET /repos/:owner/:repo/actions/workflows/:workflow_id/runs
+    run_id="$(pull workflows "$workflow_id" runs \
+        | query workflow_runs head_branch "$branch" '.id' \
+        | head -1)"
+}
+
+##
+# Fetches artifact names and download URLs for a given run ID,
+# and stores them as the keys and values, respectively, of an
+# associative array.
+#
+# API References:
+# - https://developer.github.com/v3/actions/artifacts
+# - GET /repos/:owner/:repo/actions/runs/:run_id/artifacts
+#
+function fetch_run_artifacts() {
+    unset artifacts
+    declare -gA artifacts
+    while read -r name; do
+        read -r url
+        sanitized_name="$(sanitize_name "$name")"
+        artifacts["$sanitized_name"]="$url"
+    done < <(pull runs "$run_id" artifacts \
+        | get_all artifacts '.name,.archive_download_url')
+}
+
+##
+# Fetches the job IDs and job names for a given run ID.
+# The job IDs and names are stored as the keys and values,
+# respectively, of an associative array.
+#
+# API References:
+# - https://developer.github.com/v3/actions/workflow_jobs
+# - GET /repos/:owner/:repo/actions/runs/:run_id/jobs
+#
+function fetch_run_jobs() {
+    unset jobs_array
+    declare -gA jobs_array
+    local conclusion="$1" # success or failure
+    while read -r id; do
+        read -r name
+        jobs_array["$id"]="$(sanitize_name "$name")"
+    done < <(pull runs "$run_id" jobs \
+        | query jobs conclusion "$conclusion" '.id,.name')
+}
+
+##
+# Fetches a job's log and saves it to the provided output
+# filename. The log's timestamp prefixes are filtered out for
+# easier text processing.
+#
+# API References:
+# - https://developer.github.com/v3/actions/workflow_jobs
+# - GET /repos/:owner/:repo/actions/jobs/:job_id/logs
+#
+function fetch_job_log() {
+    local jid="$1"
+    local outfile="$2"
+    pull jobs "$jid" logs \
+        | sed 's/^.*Z[ \t]*//;s/:[[:digit:]]*:[[:digit:]]*://;s/\[/./g;s/\]/./g' \
+        > "$outfile"
+}
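+
+# A sketch of the timestamp and line:column filtering (the log line is
+# hypothetical):
+#   2020-05-01T10:20:30.1234567Z src/foo.cpp:42:10: warning: unused variable [-Wunused]
+# becomes:
+#   src/foo.cpp warning: unused variable .-Wunused.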
+
+##
+# Crawls workflows, runs, and jobs for the master and current branch.
+# While crawling, downloads assets and logs, and if a run failed, diffs
+# its log against the last successful master log having the same
+# workflow and job type.
+#
+# TODO - Refactor into smaller functions and try to flatten the loop depth.
+# TODO - Improve the log differ to something that can lift out just the
+#        gcc/clang/visual-studio warnings and errors, and diff them.
+#
+function main() {
+    # Set up all prerequisites
+    init
+    echo "Comparing branch $local_branch with master"
+    echo ""
+
+    # Fetch the workflows, to be used throughout the run
+    fetch_workflows
+
+    # Step through each workflow
+    for workflow_id in "${!workflows[@]}"; do
+        workflow_name="${workflows[$workflow_id]}"
+        echo -e "${bold}${workflow_name}${reset} workflow [$workflow_id]"
+
+        # Within each workflow, we're interested in finding the newest
+        # runs that match our current branch as well as the master branch.
+        for branch in master current; do
+            if [[ "$branch" == "current" ]]; then
+                branch_name="$local_branch"
+            else
+                branch_name="master"
+            fi
+
+            # Get the run identifier for the given workflow and branch
+            fetch_workflow_run "$workflow_id" "$branch_name"
+            if [[ -z "${run_id:-}" ]]; then
+                echo " \`- no runs found for $branch_name"
+                continue
+            fi
+            [[ "$branch" == "master" ]] && joiner="|-" || joiner="\`-"
+            echo " $joiner found latest $branch_name run [$run_id]"
+
+            # Download the artifacts produced during the selected run
+            fetch_run_artifacts
+            [[ "$branch" == "master" ]] && joiner="|" || joiner=" "
+            for artifact_name in "${!artifacts[@]}"; do
+                artifact_url="${artifacts[$artifact_name]}"
+                asset_file="$assets_dir/$workflow_name-$artifact_name-$branch.zip"
+                download "$artifact_url" "$asset_file"
+                unpack "$asset_file"
+                echo -e " $joiner - unpacking ${cyan}${artifact_name}${reset} asset"
+            done
+
+            # Download the logs for the jobs within the selected run
+            for conclusion in failure success; do
+                if ! fetch_run_jobs "$conclusion"; then
+                    echo " \`- skipped $conclusion jobs"
+                    continue
+                fi
+                [[ "$conclusion" == "success" ]] && color="${green}" || color="${red}"
+                for job_id in "${!jobs_array[@]}"; do
+                    job_name="${jobs_array[$job_id]}"
+                    echo -e " $joiner - fetching ${color}${job_name}${reset} ${conclusion} log"
+                    log_file="$logs_dir/$workflow_name-$job_name-$branch-$conclusion.log"
+                    successful_master_log="$logs_dir/$workflow_name-$job_name-master-success.log"
+                    fetch_job_log "$job_id" "$log_file"
+
+                    # In the event we've found a failed job, try to diff it against a prior
+                    # successful master job of the equivalent workflow and job-type.
+                    if [[ "$conclusion" == "failure" \
+                       && -f "$log_file" \
+                       && -f "$successful_master_log" ]]; then
+                        sanitized_branch_name="$(sanitize_name "$branch_name")"
+                        diff_file="$diffs_dir/$workflow_name-$job_name-$sanitized_branch_name-vs-master.log"
+                        diff "$log_file" "$successful_master_log" > "$diff_file" || true
+                        echo -e " - diffed ${yellow}$diff_file${reset}"
+                    fi
+                done # jobs_array
+            done # conclusion types
+            echo " $joiner"
+        done # branch types
+    done # workflows
+}
+
+main
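+
+# A sketch of a typical invocation and where results land (the repo
+# portion of the path depends on your origin URL):
+#   ./scripts/diff-workflow-against-master.sh
+#   ls /tmp/<repo>-workflows-$USER/diffs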