#!/bin/bash
#
# DIY link checker
#
# - parse a URL for links
# - check whether those links are alive
#

[[ -z $1 ]] && echo "usage: $0 <url>" && exit 1
target=$1

# looking for http/s links
#urls=$(curl -s "$target" | grep -Eo 'https?://[^ "<$]+' | sort -u)

# looking for hrefs; grep -o prints every match on its own line,
# so multiple occurrences per line are handled
# TODO also deal with href='
urls=$(curl -s "$target" | grep -oiE 'href="[^"]+"' | cut -d'"' -f2 | sort -u)

for url in $urls; do
    # relative paths: prepend the target up to its last slash
    # (assumes $target contains a path, e.g. https://example.com/)
    [[ $url != http* && $url != /* ]] && url=${target%/*}/$url
    # absolute paths, relative to the domain: prepend scheme and host
    # (${target%%/*} alone would stop inside "//" and yield only "https:")
    [[ $url == /* ]] && { host=${target#*//}; url=${target%%//*}//${host%%/*}$url; }
    # -I sends a HEAD request; head -1 keeps only the status line
    # beware of the carriage return after "200 OK"
    # and **the space** and carriage return after "HTTP/2 200"
    header=$(curl -sI "$url" | head -1 | tr -d '\r')
    header=${header% }
    [[ $header != "HTTP/1.1 200 OK" \
        && $header != "HTTP/2 200" ]] && echo "$url" "$header"
    unset header
done; unset url
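
# Worked example of the URL rewriting above, assuming a hypothetical
# target of https://example.com/dir/page.html (illustration only,
# none of these URLs come from a real run):
#
#   href="img/a.png"           -> https://example.com/dir/img/a.png   (relative)
#   href="/css/site.css"       -> https://example.com/css/site.css    (domain-relative)
#   href="https://other.org/x" -> checked as-is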
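
# Example session, assuming the script is saved as linkchecker.sh
# (hypothetical output; every link that does not answer 200 is printed
# together with its status line):
#
#   $ ./linkchecker.sh https://example.com/
#   https://example.com/gone HTTP/1.1 404 Not Found
#   https://example.com/old HTTP/1.1 301 Moved Permanently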