#!/bin/bash
#
# DIY link checker
#
# - parse a URL for links
# - check whether those links are alive
#

[[ -z $1 ]] && echo "usage: $0 <url>" && exit 1
target=$1

# looking for http/s links
#urls=$(curl -s "$target" | grep -Eo 'https?://[^ "<$]+' | sort -u)

# looking for hrefs; grep -o prints every match on its own line,
# so multiple occurrences per line are handled
# TODO also deal with href='
urls=$(curl -s "$target" | grep -oiE 'href="[^"]+"' | cut -d'"' -f2 | sort -u)

for url in $urls; do
    # relative paths: prepend the target up to its last slash
    # (assumes $target contains a path, e.g. https://example.com/)
    [[ $url != http* && $url != /* ]] && url=${target%/*}/$url
    # absolute paths, relative to the domain: prepend scheme and host
    # (${target%%/*} alone would stop inside "//" and yield only "https:")
    [[ $url == /* ]] && { host=${target#*//}; url=${target%%//*}//${host%%/*}$url; }
    # -I sends a HEAD request; head -1 keeps only the status line
    # beware of the carriage return after "200 OK"
    # and **the space** and carriage return after "HTTP/2 200"
    header=$(curl -sI "$url" | head -1 | tr -d '\r')
    header=${header% }
    [[ $header != "HTTP/1.1 200 OK" \
        && $header != "HTTP/2 200" ]] && echo "$url" "$header"
    unset header
done; unset url
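
# Worked example of the URL rewriting above, assuming a hypothetical
# target of https://example.com/dir/page.html (illustration only,
# none of these URLs come from a real run):
#
#   href="img/a.png"           -> https://example.com/dir/img/a.png   (relative)
#   href="/css/site.css"       -> https://example.com/css/site.css    (domain-relative)
#   href="https://other.org/x" -> checked as-is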
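
# Example session, assuming the script is saved as linkchecker.sh
# (hypothetical output; every link that does not answer 200 is printed
# together with its status line):
#
#   $ ./linkchecker.sh https://example.com/
#   https://example.com/gone HTTP/1.1 404 Not Found
#   https://example.com/old HTTP/1.1 301 Moved Permanently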