#!/bin/ksh
#
# DIY links checker.
#
# Fetches the page given as $1, extracts every href="..." target, resolves
# relative links against the target, then HEAD-requests each link and
# reports any that is dead/too slow or returns a status other than 200/302.
#
# Usage: linkcheck URL
#
debug=0

[[ -z $1 ]] && printf 'single url?\n' >&2 && exit 1

# Pretend to be mobile Safari: some sites refuse requests from unknown
# user agents. Defined once so both curl calls stay in sync.
ua="Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"

# Strip the scheme (anything up to and including ://); curl defaults to
# http, and a bare host/path makes relative-link resolution below simpler.
target=$(printf '%s\n' "$1" | sed -r 's@.*://@@')

# Extract href="..." targets, one per line, de-duplicated.
# TODO also deal with href='
# NOTE(review): sed emits the pattern space once per line, so despite /g
# only one href per physical input line survives — TODO confirm/fix.
urls=$(curl -A "$ua" --max-time 4 -sl "$target" \
    | sed -rn 's@.*[Hh][Rr][Ee][Ff]="([^"]+)".*@\1@gp' \
    | sort -u)
(( debug == 1 )) && printf 'urls are %s\n' "$urls"

# $urls is intentionally unquoted: word-splitting turns the newline-
# separated list into loop items (URLs extracted here contain no spaces).
for url in $urls; do
    if [[ $url = http* ]]; then
        # Already absolute — use as-is.
        (( debug == 1 )) && printf 'full url goes as %s\n' "$url"
    elif [[ $url = /* ]]; then
        # Root-relative: prepend the host part of the target.
        url=${target%%/*}$url
        (( debug == 1 )) && printf 'absolute link goes as %s\n' "$url"
    elif [[ $url = [!/]* ]]; then
        # Relative: resolve against the target's directory.
        url=${target%/*}/$url
        (( debug == 1 )) && printf 'relative link goes as %s\n' "$url"
    else
        printf 'Error: could not parse URL %s\n' "$url" >&2
        exit 1
    fi

    # HEAD request; keep only the status line. Strip the trailing CR so
    # comparisons work without hard-coding it (HTTP status lines end in
    # CRLF, and HTTP/2 status lines also carry a trailing space).
    header=$(curl -A "$ua" --max-time 4 -sIl "$url" | head -1 | tr -d '\r')
    (( debug == 1 )) && printf 'header is %s\n' "$header"

    if [[ -z $header ]]; then
        printf '%s DEAD OR TOO SLOW\n' "$url"
        continue
    fi

    # Accept 200 and 302 from any HTTP version (the old exact-string
    # match passed HTTP/1.1 302 but flagged HTTP/2 302); report the rest.
    case $header in
        HTTP/*' '200*|HTTP/*' '302*) ;;
        *) printf '%s %s\n' "$url" "$header" ;;
    esac
    unset header
done
unset url