forked from andonome/lk
		
	add example pdf-to-text script
This commit is contained in:
		| @@ -18,3 +18,4 @@ Arch: tesseract-data-eng and poppler-utils | |||||||
| > tesseract -l eng  "$x" - >> *out*.txt | > tesseract -l eng  "$x" - >> *out*.txt | ||||||
| > done | > done | ||||||
|  |  | ||||||
|  | - [Example script](data/pdf-to-txt.sh) | ||||||
|   | |||||||
							
								
								
									
										25
									
								
								data/pdf-to-txt.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										25
									
								
								data/pdf-to-txt.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,25 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | # | ||||||
|  | # OCR a PDF and tidy the result into LaTeX-friendly text. | ||||||
|  | # | ||||||
|  | # Usage: pdf-to-txt.sh [input.pdf [out.tex]] | ||||||
|  | # | ||||||
|  | # Each page is rendered to a PNG with pdftoppm and read with tesseract | ||||||
|  | # (English). The text is then cleaned up with GNU sed (-i and -z are | ||||||
|  | # GNU extensions). | ||||||
|  |  | ||||||
|  | set -euo pipefail | ||||||
|  |  | ||||||
|  | input=${1:-input.pdf} | ||||||
|  | output=${2:-out.tex} | ||||||
|  |  | ||||||
|  | if [[ ! -r "$input" ]]; then | ||||||
|  | 	printf 'cannot read %s\n' "$input" >&2 | ||||||
|  | 	exit 1 | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | # Page images live in a private directory, so PNG files that already | ||||||
|  | # exist in the working directory are neither OCRed nor deleted. | ||||||
|  | pages=$(mktemp -d) | ||||||
|  | trap 'rm -rf -- "$pages"' EXIT | ||||||
|  |  | ||||||
|  | pdftoppm -png "$input" "$pages/page" | ||||||
|  |  | ||||||
|  | # Overwrite rather than append: text left over from an earlier run would | ||||||
|  | # otherwise be escaped a second time by the sed passes below. | ||||||
|  | for x in "$pages"/*.png; do | ||||||
|  | 	tesseract -l eng  "$x" - | ||||||
|  | done > "$output" | ||||||
|  |  | ||||||
|  | # With -z the whole file is a single record, so \n can be matched. | ||||||
|  | # Re-join words hyphenated across a line break. | ||||||
|  | sed -i -z -e 's/\([a-z]\)-\n\+\([a-zA-Z]\)/\1\2/g' "$output" | ||||||
|  | # Unwrap lines broken in the middle of a sentence. | ||||||
|  | sed -i -z -e 's/\([a-z]\)\n\+\([a-zA-Z]\)/\1 \2/g' "$output" | ||||||
|  | # A line made only of three or more capitals becomes a section heading. | ||||||
|  | sed -i -z -e 's/\n\([A-Z]\{3,\}\)\n/\\section{\1}\n/g' "$output" | ||||||
|  | # Put each sentence on its own line. | ||||||
|  | sed -i -z -e 's/\([a-z]\)\. \([A-Z]\)/\1\.\n\2/g' "$output" | ||||||
|  |  | ||||||
|  | # Typographic quotes to LaTeX quotes: backticks open, apostrophes close. | ||||||
|  | sed -i 's/“/``/g' "$output" | ||||||
|  | sed -i "s/”/''/g" "$output" | ||||||
|  | sed -i "s/’/'/g" "$output" | ||||||
|  | sed -i 's/‘/`/g' "$output" | ||||||
|  | # Move a full stop from inside a closing quote to after it. | ||||||
|  | sed -i "s/\.''/''\./g" "$output" | ||||||
|  | sed -i "s/ — / -- /g" "$output" | ||||||
|  | # Escape characters that are special to LaTeX. | ||||||
|  | sed -i 's/\$/\\$/g' "$output" | ||||||
|  | sed -i 's/%/\\%/g' "$output" | ||||||
|  | sed -i 's/&/\\&/g' "$output" | ||||||
		Reference in New Issue
	
	Block a user