Snippet authored by Christian Gerdes

Using WGET as a web crawler/spider in Cygwin/Linux/Unix

wget.sh
# Set up proxy if necessary

export http_proxy=http://proxyvip:8080
export https_proxy=https://proxyvip:8080
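
# Optional sanity check (sketch): one --spider request through the proxy,
# reusing the credentials from the crawl below; exit code 0 means the proxy answered
wget --spider http://www.lightsinline.se --proxy-user=p950gec --proxy-password=p950gec && echo "proxy OK"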

# Get the links one level down (--level=1, default is 5) through the proxy; log to log.txt without downloading anything (--spider)
wget --spider --force-html --recursive --level=1 http://www.lightsinline.se --proxy-user=p950gec --proxy-password=p950gec -o log.txt
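
# Variant (sketch, adjust to your site): crawl two levels deep, stay on the same
# domain (--domains) and wait a second between requests (--wait) to be polite
wget --spider --force-html --recursive --level=2 --domains=lightsinline.se --wait=1 http://www.lightsinline.se --proxy-user=p950gec --proxy-password=p950gec -o log.txt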

# Use grep and awk to filter the URLs into new_urls.dat
grep " saved " log.txt | awk '{ print $6 }' > new_urls.dat
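
# The collected URLs can be fed back to wget for another spider pass (sketch):
# --input-file reads URLs from a file; log2.txt is just an example log name
wget --spider --input-file=new_urls.dat --proxy-user=p950gec --proxy-password=p950gec -o log2.txt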

# Override the Host header (wget's equivalent of DNS spoofing: request a backend host directly while sending the real hostname)
wget --header="Host: www.realhost.com" http://backendhost:port/URI
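
# If the backend speaks HTTPS, its certificate will not match "backendhost";
# --no-check-certificate skips that check (sketch, for testing only)
wget --header="Host: www.realhost.com" --no-check-certificate https://backendhost:port/URI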