From 144a5c8c2d15c61b3d98c05f46642406a7bc2ef7 Mon Sep 17 00:00:00 2001 From: Johannes 'josch' Schauer Date: Mon, 30 Jan 2017 11:21:46 +0100 Subject: [PATCH] initial commit --- README.md | 125 +++++++++++++++++++ base64.conf | 3 + create_training_set.sh | 20 ++++ letter_template.svg | 61 ++++++++++ run.py | 265 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 474 insertions(+) create mode 100644 README.md create mode 100644 base64.conf create mode 100644 create_training_set.sh create mode 100644 letter_template.svg create mode 100755 run.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..7cd0cf6 --- /dev/null +++ b/README.md @@ -0,0 +1,125 @@ +Introduction +------------ + +Back when I was young and innocent, I created my GPG key for Debian using +subkeys according to this article: https://wiki.debian.org/Subkeys + +I then stored the master key pair on a usb drive and (just in case) printed a +base64 encoded gzipped tarball of the $GNUPGHOME containing the master key to +five pages of A4 paper. Back then I didn't find any better solution to store +binary data on paper so since I didn't expect I needed it ever anyways, I went +for the simplest solution. + +Fast forward four years I wanted to sign the key of a friend for which I need +my master key. So I got the usb stick out of its closet and plugged it in... +The usb stick was bust. No reaction anymore. No idea what happened. + +So now I was lost with five pages of base64 as the only remaining artifact of +my secret master key. + +Key Recovery +------------ + +Manually typing five pages with 4332 characters each is no fun, so even though +it would probably take longer to do so (but would be more fun) I investigated +how to automatically do it. 
+ +Complete OCR solutions like tesseract already exist and while they are +performing incredibly well for real text, they do not perform well on my input +data because they expect the text to be real human language and because they +are trained for a wide variety of fonts, making them less precise for the +specific font I used. I could train tesseract with my specific font but then I +would learn more about how to use tesseract than how to teach a machine how to +read base64 text. + +After some trial and error I found out that I apparently had printed the five +pages in question with "Liberation Mono". After lots of trial and error to find +the right approach, I settled on the following method: + + 1. find the minimum rectangle around the text + 2. rotate the image so that the text is exactly evenly aligned + 3. divide the text into the required raster of letters (luckily I used a monospaced font to print) + 4. attempt to recognize each letter + 5. present the result to the user, one letter at a time, comparing the scanned original and the guessed letter, allowing corrections to be made + 6. store the result as a text file + +Improving character recognition +------------------------------- + +I started with training my k-nearest neighbors model with blurred synthetic +images that I generated from SVGs that showed one base64 letter in "Liberation +Mono" each. It seems that there is no simple way to put letters into a bitmap +which preserves the same font baseline, thus I used SVG for the task. + +With that training set, the k-nearest neighbors algorithm was able to correctly +detect 40 out of 43 characters. The detection rate dramatically increased once +I fed the actually scanned characters from the first page into the training set +for detecting characters on the second page. After having scanned all five +pages, the k-nearest algorithm made fewer than 5 errors per 10000 characters. +Specifically it had problems differentiating 0 from O. 
+ +Finding the needle in the haystack +---------------------------------- + +Even with such a high recognition rate, the project still fails unless every +last character has been read in correctly. Only one bit swapped will lead to a +useless secret key. The problem here was how to determine when I was actually +done with the translation. The OCR engine can make errors but so can I when I +check the result. In the worst case I'd never get any indication that there is +still something wrong. + +I didn't end up storing any checksum of the data I printed, which was a big +mistake. On the other hand, I was lucky that what I stored was gzipped data and +the gzip format contains a CRC. Thus, using `gzip -dtv` I was able to check +whether the data I had decoded was correct or whether an error was still hiding +somewhere. + +It would've helped a lot if I had stored my data in a way that would allow a few +errors to be present while still being able to recover the original data. + +Storing binary data on paper +---------------------------- + +Now I'm done with recovering my key and I have some software that does the job +nearly automatically. The underlying problem though is still not solved today, +four years later. + +A bunch of software exists which promises to solve this problem but they are +each not very popular and if I trust some data to a medium that is gonna survive +decades then I want to be sure that I can still process the data decades later +and that the tool to do so hasn't vanished by then. + +Using base64 is attractive because one can be certain that the letters can +still be read by *some* method at any point in the future. Unfortunately, some +characters are hard to distinguish from each other, so using a much smaller +subset would probably make more sense but also be much more wasteful. + +There is the problem of error correction. 
There exists the PAR2 format and the +pyFileFixity tool but they both require knowledge of the exact algorithm to +make sense of the stored codes. There are qrcodes but they are limited in the +amount of data that they can store and it seems as if they are also limited to +store text only (no arbitrary bytes) and thus data has to be encoded again +before handing them to the qrcode generator. + +Some helpful resources: + + - http://blog.liw.fi/posts/qr-backup/ + - http://www.ollydbg.de/Paperbak/index.html + - https://github.com/lrq3000/pyFileFixity + +Lessons learned +--------------- + + - A backup for which you didn't make sure that you can read it back in is useless. + + - Do not encode information as base64 on paper. Some characters are just too similar. + + - Using a monospaced font helps a lot. + + - OpenCV character recognition using CvKNearest performs exceptionally well, given a good training set + + - Always store a checksum of the data you print + + - Use error correction methods (reed-solomon) + + - There still is no good way to store arbitrary binary data reliably and future-proof on paper diff --git a/base64.conf b/base64.conf new file mode 100644 index 0000000..c2b61bb --- /dev/null +++ b/base64.conf @@ -0,0 +1,3 @@ +# setPageSegMode PSM_SINGLE_CHAR +tessedit_pageseg_mode 10 +tessedit_char_whitelist ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/ diff --git a/create_training_set.sh b/create_training_set.sh new file mode 100644 index 0000000..9565e0d --- /dev/null +++ b/create_training_set.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +i=0 +for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9 + /; do +#for l in A; do + sed 's|%LETTER%|'$l'|' letter_template.svg > letter.svg + dir=training_set/$(printf %02d $i) + mkdir -p $dir + j=0 + for deg in 0; do + for x in -3 -2 -1 +0 +1 +2 +3; do + for y in -3 -2 -1 +0 +1 +2 +3; do + #convert -background white -alpha remove -rotate 
$deg -blur 0x4 -gravity Center -crop 53x99+0+0 letter.svg $dir/$j.png + convert letter.svg -page $x$y -background white -alpha remove -flatten -blur 0x4 $dir/$(printf %03d $j).png + j=$((j+1)) + done + done + done + i=$((i+1)) +done diff --git a/letter_template.svg b/letter_template.svg new file mode 100644 index 0000000..8f88e51 --- /dev/null +++ b/letter_template.svg @@ -0,0 +1,61 @@ + + + + + + image/svg+xml + + + + + + + + + %LETTER% + + diff --git a/run.py b/run.py new file mode 100755 index 0000000..8078ff4 --- /dev/null +++ b/run.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python + +from __future__ import print_function +from PIL import Image, ImageTk +import numpy as np +import cv2 +import math +import Tkinter +import subprocess +import tempfile +import os +import sys +from itertools import izip_longest + +def grouper(n, iterable, padvalue=None): + "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" + return izip_longest(*[iter(iterable)]*n, fillvalue=padvalue) + +letters_as_img=[] + +page_fname = sys.argv[1] + +image = cv2.imread(page_fname) +img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + +mean = img_gray.mean() +cols, rows = np.where(img_gray <= mean) +a = np.column_stack((cols, rows)) +center, size, angle = cv2.minAreaRect(a) + +rot = False +if angle > 45: + angle -= 90 + size = (size[1], size[0]) + rot = True +elif angle < -45: + angle += 90 + size = (size[1], size[0]) + rot = True + +rotation_mat = cv2.getRotationMatrix2D((0,0), -angle, 1.) 
# ---------------------------------------------------------------------------
# Deskew the page and crop it to the text rectangle.
#
# `image`, `center`, `size`, `angle`, `rot` and `rotation_mat` are produced by
# the minimum-area-rectangle detection immediately above this section.
# ---------------------------------------------------------------------------
radians = math.radians(angle)
sin = math.sin(radians)
cos = math.cos(radians)
height, width = image.shape[:2]
# Square output canvas with an edge of twice the page diagonal; presumably
# sized generously so the rotated page is never clipped by warpAffine.
bound_w = bound_h = int(math.sqrt(height**2 + width**2) * 2)
# Set the translation term of the affine matrix.  Which term is used depends
# on whether the detected rectangle was normalised by 90 degrees above.
if not rot:
    rotation_mat[1, 2] = -sin * width
else:
    rotation_mat[0, 2] = cos * height

print("rotating image %f degrees" % (-angle))
image = cv2.warpAffine(image, rotation_mat, (bound_w, bound_h), flags=cv2.INTER_CUBIC)
bordersize = 2
radians = math.radians(-angle)
sin = math.sin(radians)
cos = math.cos(radians)
# Map the rectangle centre into the rotated image.  NOTE(review): `center`
# stems from np.where() output, i.e. it appears to be in (row, col) order,
# which is why index 0 is used for the row slice below -- confirm before
# touching this formula.
new_center = (cos*center[0]-sin*center[1]+rotation_mat[1,2],sin*center[0]+cos*center[1]+rotation_mat[0,2])
# Crop to the text rectangle plus `bordersize` pixels of slack on every side.
image = image[int(new_center[0]-size[0]/2)-bordersize:int(new_center[0]+size[0]/2)+bordersize,
              int(new_center[1]-size[1]/2)-bordersize:int(new_center[1]+size[1]/2)+bordersize]
height, width = image.shape[:2]

# ---------------------------------------------------------------------------
# Cut the cropped page into a fixed raster of letter cells.
# ---------------------------------------------------------------------------
# The page layout is hard-coded: 57 text rows of 76 monospaced characters.
rows = 57
cols = 76

letter_height = (height-bordersize*2)/float(rows)
letter_width = (width-bordersize*2)/float(cols)
print("letter size: %f x %f"%(letter_width, letter_height))

for row in range(rows):
    for col in range(cols):
        # Every cell keeps 2*bordersize extra pixels at its far edges and is
        # clamped to the image bounds.
        miny = int(row*letter_height)
        maxy = min(int((row+1)*letter_height)+2*bordersize, height)
        minx = int(col*letter_width)
        maxx = min(int((col+1)*letter_width)+2*bordersize, width)
        letters_as_img.append(Image.fromarray(image[miny:maxy, minx:maxx]))

# One slot per cell; None means "not recognised / not confirmed yet".
letters = [None]*len(letters_as_img)

# Class index -> base64 character.  This must stay a *list*: printKey() tests
# `e.char in tobase64`, and with a str the empty e.char of modifier keys
# would match as a substring.
tobase64 = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                "abcdefghijklmnopqrstuvwxyz"
                "0123456789+/")

# Every glyph is scaled to 50x100 pixels (cv2.resize takes (width, height))
# and flattened into a 5000-element feature vector.
_SAMPLE_SIZE = (50, 100)
_SAMPLE_LEN = _SAMPLE_SIZE[0] * _SAMPLE_SIZE[1]

# ---------------------------------------------------------------------------
# Train the k-nearest-neighbour model from training_set/<class index>/*.
# ---------------------------------------------------------------------------
print("reading from training_set")

samples = []
responses = []
for i in range(len(tobase64)):
    directory = "training_set/%02d" % i
    for f in os.listdir(directory):
        path = os.path.join(directory, f)
        if not os.path.isfile(path):
            continue
        # do not train with data collected from the current page
        if "real-"+page_fname in f:
            continue
        im = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if im is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of crashing inside cv2.resize.
            print("skipping unreadable training sample " + path, file=sys.stderr)
            continue
        im = cv2.resize(im, _SAMPLE_SIZE)
        samples.append(im.reshape((1, _SAMPLE_LEN)))
        responses.append(i)

samples = np.reshape(samples, newshape=(len(responses), _SAMPLE_LEN))
samples = samples.astype(np.float32)
responses = np.array(responses, np.float32)
responses = responses.reshape((responses.size, 1))

print("training model")

model = cv2.KNearest()
model.train(samples, responses)


def _classify_letter(img):
    """Return the base64 character the k-NN model predicts for PIL image *img*."""
    im = np.asarray(img.convert('L'))
    im = cv2.resize(im, _SAMPLE_SIZE)
    im = np.float32(im.reshape((1, _SAMPLE_LEN)))
    return tobase64[int(model.find_nearest(im, k=1)[0])]


# ---------------------------------------------------------------------------
# Either resume from a previously saved <page>.txt or run the OCR from scratch.
# ---------------------------------------------------------------------------
if os.path.exists(page_fname+".txt"):
    print("loading existing results from file")
    decoded = []
    with open(page_fname+".txt") as f:
        for line in f:
            # Short lines are padded to the full row width.
            decoded.extend(line.strip().ljust(cols))
    if len(decoded) != len(letters):
        raise Exception("diff. lens: ", len(decoded), len(letters))
    # '?' is the on-disk marker for a still-unknown letter.
    letters = [l if l != '?' else None for l in decoded]
    if None not in letters:
        print("page is fully recovered")
        print("printing sloppy OCR characters")
        for i, img in enumerate(letters_as_img):
            if i % 100 == 0:
                print((i*100)//len(letters_as_img))
            letter_knearest = _classify_letter(img)
            if letter_knearest != letters[i]:
                print("wrongly detected a %s as a %s" % (letters[i], letter_knearest))
        # Feed the confirmed glyphs of this page back into the training set.
        # Kept inside the "fully recovered" branch because
        # tobase64.index(None) would raise for unknown letters.
        print("storing it as training samples")
        for i, (letter, im) in enumerate(zip(letters, letters_as_img)):
            im.save("training_set/%02d/real-%s-%04d.png"%(tobase64.index(letter), page_fname, i))
else:
    # NOTE: a tesseract single-character pass (configured via base64.conf)
    # was prototyped here as a cross-check; only the k-NN result is used.
    for i, img in enumerate(letters_as_img):
        if i % 100 == 0:
            print((i*100)//len(letters_as_img))
        # `letters` is all None on this path today; the guard keeps
        # already-known entries untouched should that ever change.
        if letters[i] is not None:
            continue
        letters[i] = _classify_letter(img)


# ---------------------------------------------------------------------------
# Review GUI: three stacked rows showing the previous, current and next cell
# as (index, scanned glyph, recognised letter).
# ---------------------------------------------------------------------------
root = Tkinter.Tk()
# PhotoImage objects are created after the Tk root exists and are kept in a
# module-level list for as long as the labels display them.
letters_as_tk = [ImageTk.PhotoImage(i.resize((100, 200), Image.ANTIALIAS)) for i in letters_as_img]


def _make_row(row):
    """Create and grid the (index, picture, letter) labels of one GUI row."""
    index_label = Tkinter.Label(root)
    index_label.grid(row=row, column=0)
    picture_display = Tkinter.Label(root)
    picture_display.grid(row=row, column=1)
    letter_label = Tkinter.Label(root, font=("Liberation Mono", 132))
    letter_label.grid(row=row, column=2)
    return index_label, picture_display, letter_label


index_label0, picture_display0, letter_label0 = _make_row(0)
index_label1, picture_display1, letter_label1 = _make_row(1)
index_label2, picture_display2, letter_label2 = _make_row(2)

current_index = 0


def _show_slot(idx, index_label, picture_display, letter_label):
    """Show cell *idx* in one GUI row, or blank the row if *idx* is out of range."""
    if 0 <= idx < len(letters_as_img):
        picture_display.config(image=letters_as_tk[idx])
        letter_label.config(text=letters[idx] if letters[idx] is not None else "")
        index_label.config(text="%d" % idx)
    else:
        # Tkinter silently drops options whose value is None, so image=None
        # would leave the previous glyph on screen; "" actually clears it.
        picture_display.config(image="")
        letter_label.config(text="")
        index_label.config(text="")


def display_index(idx):
    """Refresh the three GUI rows so that cell *idx* is shown in the middle.

    The top row shows cell ``idx-1`` and the bottom row cell ``idx+1``; rows
    that fall outside the page are blanked.
    """
    _show_slot(idx-1, index_label0, picture_display0, letter_label0)
    _show_slot(idx, index_label1, picture_display1, letter_label1)
    _show_slot(idx+1, index_label2, picture_display2, letter_label2)


display_index(current_index)


def printKey(e):
    """Handle a key press in the review window.

    * Tab        -- jump forward to the next unknown letter
    * Escape     -- write the current state to ``<page>.txt`` and quit
    * Right/space/Return -- move one cell forward
    * Left/BackSpace     -- move one cell back
    * any base64 character -- store it for the current cell and advance

    *e* is the Tkinter event; only ``e.keysym`` and ``e.char`` are read.
    """
    global current_index
    last_index = len(letters_as_img) - 1
    if e.keysym == 'Tab':
        # jump to next unknown entry
        while current_index < len(letters)-1 and letters[current_index] is not None:
            current_index += 1
        display_index(current_index)
        return
    if e.keysym == 'Escape':
        lines = grouper(cols, letters, padvalue='_')
        print("saving status as " + page_fname + ".txt")
        with open(page_fname+".txt", 'w') as f:
            for line in lines:
                # Unknown letters are stored as '?' so they can be resumed.
                print(''.join([c if c is not None else "?" for c in line]), file=f)
        root.quit()
        return
    # 'Return' is the Tk keysym of the Enter key; 'Space' and 'Enter' are kept
    # only for backward compatibility with the original list.
    if e.keysym in ['Space', 'Right', 'space', "Enter", 'Return']:
        current_index = min(current_index + 1, last_index)
        display_index(current_index)
        return
    if e.keysym in ['BackSpace', 'Left']:
        current_index = max(current_index - 1, 0)
        display_index(current_index)
        return
    if e.char in tobase64:
        letters[current_index] = e.char
        current_index = min(current_index + 1, last_index)
        display_index(current_index)


# "<Key>" fires for every key press; an empty sequence is rejected by Tk.
root.bind("<Key>", printKey)
root.mainloop()