From 25c4835b87252011c10be326ee0b3939a2d54329 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 4 Jul 2014 10:11:46 +0200 Subject: [PATCH] initial commit --- README | 11 +++++++++++ filenameext.sh | 36 ++++++++++++++++++++++++++++++++++++ urlencode.py | 23 +++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 README create mode 100755 filenameext.sh create mode 100755 urlencode.py diff --git a/README b/README new file mode 100644 index 0000000..93136b4 --- /dev/null +++ b/README @@ -0,0 +1,11 @@ +When using `wget -rk` to locally mirror a website, wget fails to do some +things which these two scripts fix: + + - if a URL contains a GET query string like + http://mysite.com/foo?bar=baz&blub=bla then a file with name + `foo?bar=baz&blub=bla` gets saved. The links from other documents to this + file do not get urlencoded. This is fixed by `urlencode.py`. + - if the copy is put online then the webserver will most likely determine the + content type of the static content by the filename extension. But files that + are saved under a name like `foo?bar=baz&blub=bla` do not have a recognized + extension. This is fixed by `filenameext.sh`. 
diff --git a/filenameext.sh b/filenameext.sh
new file mode 100755
index 0000000..fecbc6c
--- /dev/null
+++ b/filenameext.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+#
+# Give every file in the current directory a filename extension matching its
+# detected MIME type, then fix up links to the renamed file in the *php*
+# documents of the same directory.
+#
+# Meant to be run inside a directory written by `wget -rk`, where pages are
+# saved under names like `foo?bar=baz&blub=bla` that have no extension a
+# webserver could map to a content type.
+
+for f in *; do
+    printf '%s\n' "$f"
+    # skip directories
+    [ -d "$f" ] && continue
+    # -b drops the "name:" prefix, so names containing spaces or colons cannot
+    # shift the field holding the MIME type
+    case `file -b --mime-type -- "$f"` in
+        application/gzip) ext="gz";;
+        application/pdf) ext="pdf";;
+        application/x-bzip2) ext="bz2";;
+        application/x-debian-package) ext="deb";;
+        application/x-dosexec) ext="exe";;
+        application/x-rar) ext="rar";;
+        application/zip) ext="zip";;
+        audio/mpeg) ext="mp3";;
+        audio/x-wav) ext="wav";;
+        binary) ext="wav";;
+        image/gif) ext="gif";;
+        image/jpeg) ext="jpg";;
+        image/png) ext="png";;
+        image/x-ms-bmp) ext="bmp";;
+        text/html) ext="html";;
+        text/plain) ext="txt";;
+        text/x-tex) ext="tex";;
+        video/mpeg) ext="mpg";;
+        video/x-msvideo) ext="avi";;
+        # unknown type: leave the file alone instead of reusing the extension
+        # picked for an earlier file
+        *) continue;;
+    esac
+    # do not handle this file if the extension already matches
+    case "$f" in
+        *."$ext") continue;;
+    esac
+    mv -- "$f" "$f.$ext"
+    # links rewritten by urlencode.py carry & and ? percent-encoded
+    enc=`printf '%s\n' "$f" | sed 's/?/%3F/g; s/&/%26/g'`
+    # pass both names through the environment so that characters special to
+    # perl ($, @, /) in a filename are never interpolated into the program
+    OLD="$enc" NEW="$enc.$ext" \
+        perl -pi -e 's/\Q$ENV{OLD}\E([#"])/$ENV{NEW}$1/g' *php*
+done
diff --git a/urlencode.py b/urlencode.py
new file mode 100755
index 0000000..3809cbf
--- /dev/null
+++ b/urlencode.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Percent-encode ``?`` and ``&`` in links to ``.php`` pages.
+
+Usage: urlencode.py FILE...
+
+Every FILE is rewritten in place.  Inside ``href="..."`` and ``src="..."``
+attributes whose value contains ``.php``, each ``?`` becomes ``%3F`` and each
+``&`` becomes ``%26``.  Files that fail to decode as text are skipped.
+"""
+
+import re
+import sys
+
+# href="..." / src="..." attribute whose value contains ".php"
+LINK_RE = re.compile(r'(?:href|src)="[^"]+\.php[^"]*"')
+# one-pass replacement table for the two characters that need escaping
+ESCAPES = str.maketrans({"?": "%3F", "&": "%26"})
+
+
+def aux(match):
+    """Return the matched attribute with ``?`` and ``&`` percent-encoded."""
+    return match.group(0).translate(ESCAPES)
+
+
+def main(filenames):
+    """Rewrite each file in *filenames* in place."""
+    for filename in filenames:
+        try:
+            # newline="" keeps the file's original line endings intact
+            with open(filename, newline="") as f:
+                data = f.read()
+        except UnicodeDecodeError:
+            # most likely binary content; nothing to rewrite
+            continue
+
+        fixed = LINK_RE.sub(aux, data)
+        if fixed == data:
+            # leave untouched files (and their timestamps) alone
+            continue
+
+        with open(filename, "w", newline="") as f:
+            f.write(fixed)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])