Add a script to fetch and diff GitHub workflow logs

2020-01-29 22:48:19 -08:00 · 2020-01-29 22:48:19 -08:00 · 94cd6df6cd
commit 94cd6df6cd
parent 89e9e7336f
1 changed files with 471 additions and 0 deletions
--- a/scripts/diff-workflow-against-master.sh
+++ b/scripts/diff-workflow-against-master.sh
@ -0,0 +1,471 @@
+#!/bin/bash
+
+# Copyright (C) 2020  Kevin R Croft <krcroft@gmail.com>
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+##
+#  This script crawls the current repo's GitHub workflow content.
+#  It fetches assets and logs from the most recent successful master
+#  run followed by the most recent (and possibly failing) current
+#  branch, which can also be a master branch.
+#  The goal of this script is twofold:
+#    - Provide a mechanized and automated way to fetch CI records.
+#    - Provide a rapid way to diff bad CI runs against master.
+#
+#  This script requires a GitHub account in order to generate an
+#  auth-token. Simply run the script, it will provide instructions.
+#
+set -euo pipefail
+shopt -s nullglob
+
+# Fixed portion of the URL
+declare -gr scheme="https://"
+declare -gr authority="api.github.com"
+
+# Colors
+declare -gr bold="\\e[1m"
+declare -gr red="\\e[31m"
+declare -gr green="\\e[32m"
+declare -gr yellow="\\e[33m"
+declare -gr cyan="\\e[36m"
+declare -gr reset="\\e[0m"
+
+##
+#  Changes the working directory to that of the
+#  repository's root.
+#
+function cd_repo_root() {
+	# in_root is a script-wide flag that turns repeated calls into no-ops
+	if [[ "${in_root:-}" == "true" ]]; then
+		return 0
+	fi
+
+	# Resolve the directory holding this script. Use '&&' so a failed cd
+	# is reported instead of silently yielding the caller's directory.
+	script_path="$(cd "$(dirname "$0")" > /dev/null 2>&1 && pwd -P)" || {
+		echo "Unable to resolve the directory containing $0" >&2
+		exit 1
+	}
+	pushd "$script_path" > /dev/null
+
+	# Ask git for the top of the working tree; capture it separately so
+	# a failure (e.g. not inside a repository) is not masked by pushd.
+	local repo_root
+	repo_root="$(git rev-parse --show-toplevel)" || {
+		echo "$script_path is not inside a git repository" >&2
+		exit 1
+	}
+	pushd "$repo_root" > /dev/null
+	in_root=true
+}
+
+##
+#  Determines the full GitHub repo name using the
+#  remote origin set in the repository's configuration
+#
+function init_baseurl() {
+	cd_repo_root
+
+	# Read the origin URL on its own so a missing remote is reported
+	local origin_url
+	origin_url="$(git config --get remote.origin.url)" || {
+		echo "No remote.origin.url is configured for this repository" >&2
+		exit 1
+	}
+
+	# Reduce the origin to "owner/name":
+	#   1. drop everything up to the last colon (scheme or user@host:)
+	#   2. drop a trailing ".git"
+	#   3. drop a leading "//host/" left over from https-style URLs
+	# The last expression uses '|' as its delimiter so the '^' inside
+	# the bracket expression cannot be mistaken for the delimiter.
+	repo="$(sed 's/.*://;s/\.git$//;s|//[^/]*/||' <<< "$origin_url")"
+
+	# $scheme already ends in "://", so the authority follows it directly
+	# (inserting another slash would produce "https:///api.github.com").
+	baseurl="${scheme}${authority}/repos/$repo/actions"
+	declare -gr repo
+	declare -gr baseurl
+}
+
+##
+#  Determines the local branch name
+#
+function init_local_branch() {
+	# git must be queried from inside the repository
+	cd_repo_root
+	# Abbreviated ref name of HEAD, frozen for the rest of the run
+	local_branch="$(git rev-parse --abbrev-ref HEAD)"
+	readonly local_branch
+}
+
+##
+#  Sets up NETRC credentials for GitHub's v3 API server.
+#  NETRC is currently the most secure way to provide
+#  credentials to CURL, because it prevents other processes
+#  from inspecting the process environment and cmd arguments
+#  (both of which can be found in /proc).
+#
+function init_netrc() {
+	# Check and setup the GitHub personal access token
+	local netrc="$HOME/.netrc"
+	# Only grep when the file exists, to avoid a spurious grep error
+	# on a first run where there is no netrc yet.
+	if [[ -f "$netrc" ]] && grep -q "^machine $authority" "$netrc"; then
+		echo "Found credentials for $authority in $netrc"
+		return
+	fi
+
+	# Keep the secrets out of the global scope
+	local username token
+
+	# Get the username
+	clear
+	echo ""
+	echo "1. Enter your GitHub account email address, example: jcousteau@scuba.fr"
+	echo ""
+	read -r -p "GitHub account email: " username
+
+	# Get the token
+	clear
+	echo ""
+	echo "2. Login to GitHub and visit https://github.com/settings/tokens"
+	echo ""
+	echo "3. Click 'Generate new token' and set the 'public_repo' permission:"
+	echo ""
+	echo "    [ ]  repo                  Full control of private repos"
+	echo "        [ ] repo:status        Access commit status"
+	echo "        [ ] repo_deployment    Access deployment status"
+	echo "        [X] public_repo        Access public repositories"
+	echo "        [ ] repo:invite        Access repository invitations"
+	echo ""
+	echo "   Type a name for the token then scroll down and click 'Generate Token'."
+	echo ""
+	echo "4. Copy & paste your token, for example: f7e6b2344bd2c1487597b61d77703527a692a072"
+	echo ""
+	# Deliberately echo the token so the user can verify its correctness
+	read -r -p "Personal Access Token: " token
+
+	clear
+	if [[ -z "${username:-}" || -z "${token:-}" ]]; then
+		echo "Failed to setup $netrc or some of the credentials were empty!" >&2
+		exit 1
+	fi
+
+	# Create the file and restrict it to the user BEFORE the token is
+	# written, so the secret is never readable under the default umask.
+	touch "$netrc"
+	chmod 600 "$netrc"
+
+	# Add the credential to netrc
+	{
+		echo "machine $authority"
+		echo "login $username"
+		echo "password $token"
+		echo ""
+	} >> "$netrc"
+	echo "Added your credentials to $netrc"
+}
+
+##
+#  Makes strings suitable for directory and filenames
+#   - spaces => underscores
+#   - upper-case => lower-case
+#   - slashes => dashes
+#   - parenthesis => stripped
+#   - equals  => dashes
+#
+function sanitize_name() {
+	# Arguments: $1 - the string to sanitize
+	# Outputs:   the sanitized string on stdout
+	#
+	# Uses bash expansions instead of sed, which avoids the GNU-only
+	# '\L' extension and a process spawn per call.
+	local name="$1"
+	name="${name,,}"       # upper-case => lower-case
+	name="${name// /_}"    # spaces     => underscores
+	name="${name//\//-}"   # slashes    => dashes
+	name="${name//[()]/}"  # parentheses are stripped
+	name="${name//=/-}"    # equals     => dashes
+	# printf, unlike echo, prints names such as "-n" verbatim
+	printf '%s\n' "$name"
+}
+
+##
+#  Returns how old a file or directory is, in seconds.
+#
+function seconds_old() {
+	# Arguments: $1 - path to a file or directory (symlinks are followed)
+	# Outputs:   current epoch time minus the path's modification time
+	printf '%d\n' "$(( $(date +%s) - $(stat -L --format %Y "$1") ))"
+}
+
+##
+#  Creates a storage area for all content fetched by the script.
+#  This includes cached JSON output (valid for 5 minutes), along
+#  with zip assets, log files, and diffs.
+#
+declare -g parent # used by the trap
+function init_dirs() {
+	local repo_postfix
+	repo_postfix="$(basename "$repo")"
+
+	# USER is not guaranteed to be set (cron, containers) and 'set -u'
+	# would abort on it, so fall back to asking id(1).
+	local user_name="${USER:-}"
+	if [[ -z "$user_name" ]]; then
+		user_name="$(id -un)"
+	fi
+
+	parent="/tmp/$repo_postfix-workflows-$user_name"
+	assets_dir="$parent/assets"
+	cache_dir="$parent/cache"
+	diffs_dir="$parent/diffs"
+	logs_dir="$parent/logs"
+	declare -gr assets_dir
+	declare -gr cache_dir
+	declare -gr diffs_dir
+	declare -gr logs_dir
+	echo "Initializing storage area: $parent"
+
+	# Don't trust content from a prior interrupted run
+	if [[ -f "$parent/.interrupted" ]]; then
+		# ':?' refuses to run rm against an empty path
+		rm -rf -- "${parent:?}"
+	fi
+
+	# Make the directories if they don't exist
+	local dir filename
+	for dir in "$assets_dir" "$cache_dir" "$diffs_dir" "$logs_dir"; do
+		if [[ ! -d "$dir" ]]; then
+			mkdir -p "$dir"
+		# Otherwise, purge content older than 5-minutes
+		else
+			for filename in "$dir"/*; do
+				if [[ "$(seconds_old "$filename")" -gt 300 ]]; then
+					rm -rf -- "$filename"
+				fi
+			done
+		fi
+	done
+	# If the user Ctrl-C'd the job, then some files might be
+	# partially written, so drop a breadcrumb to clean up next run.
+	# (we could just blow away the content here, but we want to
+	# let the user inspect content after interrupting the run.)
+	# The handler exits explicitly: a trapped INT otherwise lets the
+	# script carry on with whatever command comes next.
+	trap 'touch "$parent/.interrupted"; exit 130' INT
+}
+
+##
+#  Ensures all pre-requisites are setup and have passed
+#  before we start making REST queries and writing files.
+#
+function init() {
+	# Run each setup stage in order; the directories are created last
+	# because their location is derived from the repository name.
+	local stage
+	for stage in init_baseurl init_local_branch init_netrc init_dirs; do
+		"$stage"
+	done
+}
+
+##
+#  Downloads a file if we otherwise don't have it.
+#  (Note that the script on launch cleans up files older than
+#  5 minutes, so most of the time we'll be downloading.)
+#
+function download() {
+	# Arguments: $1 - URL to fetch
+	#            $2 - file the response body is written to
+	local url="$1"
+	local outfile="$2"
+
+	# Anything already on disk is treated as a valid copy
+	if [[ -f "$outfile" ]]; then
+		return 0
+	fi
+
+	# Quiet, redirect-following fetch authenticated through ~/.netrc
+	local -a curl_args=(--silent --location --netrc "$url" -o "$outfile")
+	curl "${curl_args[@]}"
+}
+
+##
+#  Unzips files inside their containing directory.
+#  Clobbers existing files.
+#
+function unpack() {
+	# Arguments: $1 - path to the zip archive; removed after extraction
+	local zipfile="$1"
+	local zipdir
+	zipdir="$(dirname "$zipfile")"
+	# Extract next to the archive with '-d' rather than changing into
+	# the directory first: a relative $zipfile would no longer resolve
+	# after a directory change.
+	unzip -qq -o "$zipfile" -d "$zipdir"
+	rm -f "$zipfile"
+}
+
+##
+#  Constructs and fetches REST urls using our personal access
+#  token. Files are additionally hashed based on the REST URL
+#  and cached.  This allows for rapid-rerunning without needing
+#  to hit GitHub's API again (for duplicate requests). This
+#  avoids exceeding our repo limit on API calls/day.
+#
+function pull() {
+	# Arguments: path elements appended, slash-separated, to $baseurl
+	# Outputs:   the (possibly cached) response body on stdout
+	local url="$baseurl"
+	if (( $# > 0 )); then
+		# printf reuses its format for every argument: "/a/b/c"
+		local suffix
+		printf -v suffix '/%s' "$@"
+		url+="$suffix"
+	fi
+
+	# The cache filename is the MD5 of the URL plus a trailing newline
+	local digest
+	digest="$(md5sum <<< "$url")"
+	local outfile="${cache_dir}/${digest%% *}.json"
+
+	# Only hit the network when there is no cached response
+	[[ -f "$outfile" ]] || download "$url" "$outfile"
+	cat "$outfile"
+}
+
+##
+#  Gets one or more keys from all records
+#
+function get_all() {
+	# Arguments: $1 - name of the top-level array in the JSON on stdin
+	#            $2 - jq expression selecting the value(s) per record
+	# Outputs:   the selected values, raw, one per line
+	local container="$1"
+	local return_keys="$2"
+	local filter=".${container}[] | ${return_keys}"
+	jq -r "$filter"
+}
+
+##
+#  Gets one or more return_key(s) from records that have
+#  matching search_key and search_value hits
+#
+function query() {
+	# Arguments: $1 - name of the top-level array in the JSON on stdin
+	#            $2 - record key to compare
+	#            $3 - string value the key must equal
+	#            $4 - jq expression selecting the value(s) to return
+	# Outputs:   the selected values of matching records, one per line
+	local container="$1"
+	local search_key="$2"
+	local search_value="$3"
+	local return_keys="$4"
+
+	# The searched value is handed to jq as $value via --arg, so only
+	# the container, key and return expression are spliced into the text.
+	local filter
+	printf -v filter '.%s[] | if .%s == $value then %s else empty end' \
+		"$container" "$search_key" "$return_keys"
+	jq -r --arg value "$search_value" "$filter"
+}
+
+##
+#  Pulls the subset of active workflows from GitHub having
+#  path values that match the local repos filenames inside
+#  .github/workflows (otherwise there are 30+ workflows).
+#
+#  The workflow numeric ID and textual name are stored
+#  in an associative array as key and value, respectively.
+#
+#  API References:
+#   - https://developer.github.com/v3/actions/workflows/
+#   - GET /repos/:owner/:repo/actions/workflows
+#
+function fetch_workflows() {
+	# Start from a clean, global associative array: id => sanitized name
+	unset workflows
+	declare -gA workflows
+	for workflow_path in ".github/workflows/"*.yml; do
+		# The query prints the ID on the first line, the name on the second
+		local id_and_name
+		id_and_name="$(pull workflows \
+		             | query workflows path "$workflow_path" '.id,.name')"
+		local id="${id_and_name%$'\n'*}"
+		local name="${id_and_name#*$'\n'}"
+
+		# Skip workflows GitHub doesn't know about
+		[[ -n "${id:-}" && -n "${name:-}" ]] || continue
+
+		# Skip a couple master-only workflows
+		case "$name" in
+			"Config heavy" | "Coverity Scan") continue ;;
+		esac
+
+		workflows["$id"]="$(sanitize_name "$name")"
+	done
+}
+
+##
+#  Fetches the first run identifier for a given workflow ID
+#  and branch name. The run ID is stored in the run_id variable.
+#
+#  API References:
+#   - https://developer.github.com/v3/actions/workflow_runs
+#   - GET /repos/:owner/:repo/actions/runs/:run_id
+#
+function fetch_workflow_run() {
+	# Arguments: $1 - workflow ID
+	#            $2 - branch name the run's head_branch must equal
+	# Globals:   run_id (written); left empty when nothing matches
+	declare -g run_id
+	local workflow_id="$1"
+	local branch="$2"
+
+	# GET /repos/:owner/:repo/actions/workflows/:workflow_id/runs
+	local matching_ids
+	matching_ids="$(pull workflows "$workflow_id" runs \
+	              | query workflow_runs head_branch "$branch" '.id')"
+
+	# Keep the first line only. This is done in the shell rather than
+	# with 'head -1', which under 'pipefail' can fail the whole pipeline
+	# with SIGPIPE if the producer is still writing when head exits.
+	run_id="${matching_ids%%$'\n'*}"
+}
+
+##
+#  Fetches artifact names and download URLs for a given run ID,
+#  and stores them in an associative array as key and value, respectively.
+#
+#  API References:
+#   - https://developer.github.com/v3/actions/artifacts
+#   - GET /repos/:owner/:repo/actions/runs/:run_id/artifacts
+#
+function fetch_run_artifacts() {
+	# Globals: run_id (read), artifacts (rebuilt: sanitized name => URL)
+	unset artifacts
+	declare -gA artifacts
+
+	# Keep the loop variables out of the global scope
+	local name url
+
+	# The listing alternates name and URL lines. Both are read in the
+	# loop condition so an unpaired trailing line ends the loop rather
+	# than failing a bare 'read' and aborting the script under 'set -e'.
+	while read -r name && read -r url; do
+		artifacts["$(sanitize_name "$name")"]="$url"
+	done < <(pull runs "$run_id" artifacts \
+	         | get_all artifacts '.name,.archive_download_url')
+}
+
+##
+#  Fetches the job IDs and job names for a given run ID.
+#  The job IDs and names are stored in an associative array,
+#  respectively.
+#
+#  API References:
+#   - https://developer.github.com/v3/actions/workflow_jobs
+#   - GET /repos/:owner/:repo/actions/runs/:run_id/jobs
+#
+function fetch_run_jobs() {
+	# Arguments: $1 - job conclusion to select: success or failure
+	# Globals:   run_id (read), jobs_array (rebuilt: id => sanitized name)
+	unset jobs_array
+	declare -gA jobs_array
+	local conclusion="$1" # success or failure
+
+	# Keep the loop variables out of the global scope
+	local id name
+
+	# The listing alternates ID and name lines. Both are read in the
+	# loop condition so an unpaired trailing line ends the loop rather
+	# than failing a bare 'read' and aborting the script under 'set -e'.
+	while read -r id && read -r name; do
+		jobs_array["$id"]="$(sanitize_name "$name")"
+	done < <(pull runs "$run_id" jobs \
+	         | query jobs conclusion "$conclusion" '.id,.name')
+}
+
+##
+#  Fetches a job's log, and saves it in the provided output
+#  filename. The log's leading time-stamps are stripped for easier
+#  text processing.
+#
+#  API References:
+#   - https://developer.github.com/v3/actions/workflow_jobs
+#   - GET /repos/:owner/:repo/actions/jobs/:job_id/logs
+#
+function fetch_job_log() {
+	# Arguments: $1 - job ID
+	#            $2 - file the filtered log is written to
+	local jid="$1"
+	local outfile="$2"
+	# The sed expressions, in order:
+	#   1. strip the leading time-stamp. The match is confined to the
+	#      first whitespace-free token ending in "<digit>Z"; a greedy
+	#      '.*Z' would also eat log text up to the last 'Z' on the line.
+	#      NOTE(review): assumes time-stamps look like
+	#      2020-01-30T06:48:19.1234567Z - confirm against a real log.
+	#   2. strip the first ":<digits>:<digits>:" sequence
+	#   3. replace square brackets with dots
+	pull jobs "$jid" logs \
+	| sed 's/^[^[:space:]]*[0-9]Z[ \t]*//;s/:[[:digit:]]*:[[:digit:]]*://;s/\[/./g;s/\]/./g' \
+	> "$outfile"
+}
+
+##
+#  Crawl workflows, runs, and jobs for the master and current branch.
+#  While crawling, download assets and logs, and if a run failed, diff
+#  that log against the last successful master-equivalent having the same
+#  workflow and job type.
+#
+#  TODO - Refactor into smaller functions and try to flatten the loop depth.
+#  TODO - Improve the log differ to something that can lift out just the
+#         gcc/clang/visual-studio warnings and errors, and diff them.
+#
+function main() {
+	# Setup all pre-requisites
+	init
+	echo "Comparing branch $local_branch with master"
+	echo ""
+
+	# Fetch the workflows, to be used throughout the run
+	fetch_workflows
+
+	# Scratch state for the crawl. run_id, workflows, artifacts and
+	# jobs_array are deliberately not listed: they are globals written
+	# by the fetch_* helpers.
+	local workflow_id workflow_name branch branch_name joiner
+	local artifact_name artifact_url asset_file
+	local conclusion color job_id job_name
+	local log_file successful_master_log
+	local sanitized_branch_name diff_file
+
+	# Step through each workflow
+	for workflow_id in "${!workflows[@]}"; do
+		workflow_name="${workflows[$workflow_id]}"
+		echo -e "${bold}${workflow_name}${reset} workflow [$workflow_id]"
+
+		# Within the workflows, we're interested in finding the newest subset of
+		# runs that match our current branch as well as the master branch.
+		# Master goes first so its successful logs exist by the time the
+		# current branch's failures are diffed against them.
+		for branch in master current; do
+			if [[ "$branch" == "current" ]]; then
+				branch_name="$local_branch"
+			else
+				branch_name="master"
+			fi
+
+			# Get the run identifier for the given workflow and branch
+			fetch_workflow_run "$workflow_id" "$branch_name"
+			if [[ -z "${run_id:-}" ]]; then
+				echo "  \`- no runs found for $branch_name"
+				continue
+			fi
+			if [[ "$branch" == "master" ]]; then
+				joiner="|-"
+			else
+				joiner="\`-"
+			fi
+			echo "  $joiner found latest $branch_name run [$run_id]"
+
+			# Download the artifacts produced during the selected run
+			fetch_run_artifacts
+			if [[ "$branch" == "master" ]]; then
+				joiner="|"
+			else
+				joiner=" "
+			fi
+			for artifact_name in "${!artifacts[@]}"; do
+				artifact_url="${artifacts[$artifact_name]}"
+				asset_file="$assets_dir/$workflow_name-$artifact_name-$branch.zip"
+				download "$artifact_url" "$asset_file"
+				unpack "$asset_file"
+				echo -e "  $joiner     - unpacking ${cyan}${artifact_name}${reset} asset"
+			done
+
+			# Download the logs for the jobs within the selected run
+			for conclusion in failure success; do
+				if ! fetch_run_jobs "$conclusion"; then
+					# Report the run rather than a job: no job ID has been
+					# assigned at this point, and reading one would trip
+					# 'set -u' on the first pass.
+					echo "      \`- skipped $conclusion jobs for run [$run_id]"
+					continue
+				fi
+				if [[ "$conclusion" == "success" ]]; then
+					color="${green}"
+				else
+					color="${red}"
+				fi
+				for job_id in "${!jobs_array[@]}"; do
+					job_name="${jobs_array[$job_id]}"
+					echo -e "  $joiner     - fetching  ${color}${job_name}${reset} ${conclusion} log"
+					log_file="$logs_dir/$workflow_name-$job_name-$branch-$conclusion.log"
+					successful_master_log="$logs_dir/$workflow_name-$job_name-master-success.log"
+					fetch_job_log "$job_id" "$log_file"
+
+					# In the event we've found a failed job, try to diff it against a prior
+					# successful master job of the equivalent workflow and job-type.
+					if [[ "$conclusion" == "failure"
+					&& -f "$log_file"
+					&& -f "$successful_master_log" ]]; then
+						sanitized_branch_name="$(sanitize_name "$branch_name")"
+						diff_file="$diffs_dir/$workflow_name-$job_name-$sanitized_branch_name-vs-master.log"
+						# diff exits non-zero when the files differ, which is
+						# the expected case here and must not trip 'set -e'.
+						diff "$log_file" "$successful_master_log" > "$diff_file" || true
+						echo -e "        - diffed    ${yellow}$diff_file${reset}"
+					fi
+				done # jobs_array
+			done # conclusion types
+			echo "  $joiner"
+		done # branch types
+	done # workflows
+}
+
+main "$@"