From dda08f40e9abd9a6c6c34072b28865ae0bc1b714 Mon Sep 17 00:00:00 2001
From: Malin Freeborn
Date: Mon, 12 Sep 2022 23:01:54 +0200
Subject: [PATCH] add example pdf-to-text script

---
 data/pdf-to-txt.md |  1 +
 data/pdf-to-txt.sh | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100755 data/pdf-to-txt.sh

diff --git a/data/pdf-to-txt.md b/data/pdf-to-txt.md
index 40ae28c..92de8b8 100644
--- a/data/pdf-to-txt.md
+++ b/data/pdf-to-txt.md
@@ -18,3 +18,4 @@ Arch: tesseract-data-eng and poppler-utils
 > tesseract -l eng "$x" - >> *out*.txt
 > done
 
+- [Example script](data/pdf-to-txt.sh)
diff --git a/data/pdf-to-txt.sh b/data/pdf-to-txt.sh
new file mode 100755
index 0000000..c6da2a9
--- /dev/null
+++ b/data/pdf-to-txt.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Convert a scanned PDF to rough LaTeX source using OCR.
+#
+# Usage: pdf-to-txt.sh [input.pdf [out.tex]]
+#
+# Each page is rendered to PNG with pdftoppm and read with tesseract; the
+# recognised text is cleaned up with GNU sed and appended to the output file.
+
+set -euo pipefail
+
+input=${1:-input.pdf}
+output=${2:-out.tex}
+
+# Work in a private directory so that PNG files already present in the
+# current directory are neither fed to tesseract nor deleted afterwards.
+workdir=$(mktemp -d)
+trap 'rm -rf -- "$workdir"' EXIT
+text=$workdir/ocr.tex
+
+pdftoppm -png "$input" "$workdir/page"
+
+for page in "$workdir"/*.png; do
+	tesseract -l eng "$page" - >> "$text"
+done
+
+# Whole-file pass (-z splits records on NUL, not newline, so \n can match):
+#   1. re-join words hyphenated across a line break
+#   2. re-join lines broken in the middle of a sentence
+#   3. turn a line of three or more capitals into a \section heading
+#   4. start a new line after each sentence
+sed -i -z \
+	-e 's/\([a-z]\)-\n\+\([a-zA-Z]\)/\1\2/g' \
+	-e 's/\([a-z]\)\n\+\([a-zA-Z]\)/\1 \2/g' \
+	-e 's/\n\([A-Z]\{3,\}\)\n/\\section{\1}\n/g' \
+	-e 's/\([a-z]\)\. \([A-Z]\)/\1.\n\2/g' \
+	"$text"
+
+# Line-wise pass: typographic quotes and dashes become their LaTeX spellings,
+# then the LaTeX special characters $, % and & are escaped.
+sed -i \
+	-e 's/“/``/g' \
+	-e "s/”/''/g" \
+	-e "s/’/'/g" \
+	-e 's/‘/`/g' \
+	-e "s/\.''/''./g" \
+	-e 's/ — / -- /g' \
+	-e 's/\$/\\$/g' \
+	-e 's/%/\\%/g' \
+	-e 's/&/\\&/g' \
+	"$text"
+
+# Append only now, so text already in the output file is not escaped twice.
+cat "$text" >> "$output"