PDF-Embanner/pdfembannersrc/get_text.py

140 lines
6.3 KiB
Python
Raw Permalink Normal View History

2018-05-18 06:49:19 +00:00
# -*-coding:utf-8 -*
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from tkinter import ttk
from pdfembannersrc import subwindows
import logging
import PyPDF2
import os
# Logger shared by this module; getLogger() without a name returns the root logger.
logger = logging.getLogger()
class Interface(tk.Toplevel):
    """
    Text extraction interface.

    Modal window (transient to ``parent``, with an input grab) that lets the
    user pick a PDF file and dump its text either into a single ``.txt`` file
    next to the PDF, or into one ``.txt`` file per page inside a folder named
    after the PDF.
    """

    def __init__(self, parent, **kwargs):
        """
        Build the window and its widgets.

        parent -- the widget this window is transient to
        kwargs -- extra options forwarded to the inner ``tk.Frame``
        """
        super().__init__(parent)
        self.transient(parent)
        self.grab_set()
        self.title("PDF Embanner : full split")
        self.geometry("800x160")
        # Closing through the window manager or the Escape key both go
        # through close().
        self.protocol("WM_DELETE_WINDOW", self.close)
        self.bind("<Escape>", self.close)
        # Path of the PDF currently selected, None until open() succeeds.
        self.file = None
        self.f = tk.Frame(self, width=768, height=576, **kwargs)
        self.f.pack(fill=tk.BOTH)
        # Create the widgets
        self.f.columnconfigure(1, weight=1)
        self.f.rowconfigure(12, weight=1)
        tk.Label(self.f, text="Extract text", bg="blue", fg="white", padx=20).grid(
            row=0, column=0, columnspan=3, sticky=tk.W)
        tk.Label(self.f, text="Return the text in PDF in one or several text files").grid(
            row=1, column=0, columnspan=3, sticky=tk.W)
        tk.Button(self.f, text="Open", command=self.open).grid(row=2, column=0)
        # No file is selected yet at construction time, hence the placeholder.
        self.open_label = tk.Label(self.f, text="-")
        self.open_label.grid(row=2, column=1, columnspan=3, sticky=tk.W)
        tk.Button(self.f, text="Extract text in one file", command=self.do_one, fg="blue").grid(
            row=3, column=3)
        tk.Button(self.f, text="Extract text per page", command=self.do_ppage, fg="blue").grid(
            row=3, column=2)
        ttk.Separator(self.f, orient="horizontal").grid(
            row=5, column=0, columnspan=4, sticky=tk.W + tk.E, padx=5, pady=10)
        tk.Button(self.f, text="Close", command=self.close).grid(row=11, column=3)
        # Status line updated by the extraction methods.
        self.message = tk.Label(self.f, text="Welcome!")
        self.message.grid(row=13, column=0, columnspan=4, sticky=tk.W)

    @staticmethod
    def _output_stem(pdf_path):
        """Return ``pdf_path`` without its extension, whatever its length."""
        return os.path.splitext(pdf_path)[0]

    @staticmethod
    def _single_output_path(pdf_path):
        """Return the path of the single text file produced for ``pdf_path``."""
        return Interface._output_stem(pdf_path) + ".txt"

    @staticmethod
    def _page_output_path(folder, index):
        """Return the text file path for page ``index`` inside ``folder``.

        Files are named after the folder itself, followed by the zero-padded
        page index.
        """
        return "{}/{}_{:03d}.txt".format(folder, os.path.basename(folder), index)

    def open(self, *args):
        """Ask the user for a PDF file and remember the selection.

        Nothing changes when the dialog is cancelled.
        """
        ftypes = [('PDF files (Portable Document Format)', '*.pdf'), ('All files', '*')]
        fl = filedialog.askopenfilename(filetypes=ftypes)
        # A cancelled dialog yields an empty value ('' or, on some Tk builds,
        # an empty tuple); a truthiness test covers both.
        if fl:
            self.file = fl
            self.open_label["text"] = fl

    def do_one(self, *args):
        """Extract the text of every page into one UTF-8 ``.txt`` file.

        The output file sits next to the PDF and shares its name. If the PDF
        cannot be read, an error dialog is shown and no output file is
        written.
        """
        if self.file is None:
            messagebox.showwarning(title="PDF Output", message="Please open the PDF to split before !")
            return

        self.message["text"] = "Extracting"
        outbasename = self._single_output_path(self.file)
        progress = None
        pages = []
        try:
            with open(self.file, 'rb') as in_f:
                inpdf = PyPDF2.PdfFileReader(in_f)
                page_count = inpdf.getNumPages()
                progress = subwindows.Progress(self, page_count, "Producing PDFs...")
                progress.message["text"] = 'Reading files'
                for i in range(page_count):
                    pages.append(inpdf.getPage(i).extractText() + "\n")
                    progress.next()
        except IOError:
            logger.warning("Text extraction : Could not open PDF file %s.", self.file)
            messagebox.showerror(title="Error",
                                 message="Impossible to open PDF file {}".format(self.file))
            # Reading failed: do not attempt to write a partial/undefined output.
            return
        except Exception as e:
            logger.warning("Text extraction : Unknown error occured during PDF production. %s", e)
            messagebox.showerror(title="Error",
                                 message="An Error occured :\n{}\nNo output produced!".format(e))
            return
        finally:
            # Runs on success and on both early returns above.
            if progress is not None:
                progress.close()

        try:
            # Explicit UTF-8: extracted text may contain characters the
            # platform default encoding cannot represent.
            with open(outbasename, 'w', encoding="utf-8") as out_f:
                out_f.write("".join(pages))
            self.message["text"] = "Done"
        except IOError:
            logger.warning("Text extraction : Could not open output file %s.", outbasename)
            messagebox.showerror(title="Error",
                                 message="Impossible to open output file {}".format(outbasename))
        except Exception as e:
            logger.warning("Text extraction : Unknown error occured during PDF production. %s", e)
            messagebox.showerror(title="Error",
                                 message="An Error occured :\n{}\nNo output produced!".format(e))

    def do_ppage(self, *args):
        """Extract the text of each page into its own UTF-8 ``.txt`` file.

        Files are written to a folder named after the PDF (extension removed),
        located next to it. If that folder already exists an error dialog is
        shown and extraction proceeds into it; if it cannot be created at all,
        extraction stops after the dialog.
        """
        if self.file is None:
            messagebox.showwarning(title="PDF Output", message="Please open the PDF to split before !")
            return

        self.message["text"] = "Splitting"
        outfoldername = self._output_stem(self.file)
        try:
            os.mkdir(outfoldername)
        except OSError as e:
            logger.warning("Text extraction : Could not create folder %s :: %s", outfoldername, e)
            messagebox.showerror(title="Error",
                                 message="Folder {} already exists or could not be created".format(outfoldername))
            # Keep going only when the folder is actually usable; otherwise
            # every page write would fail and raise a second error dialog.
            if not os.path.isdir(outfoldername):
                return

        progress = None
        try:
            with open(self.file, 'rb') as in_f:
                inpdf = PyPDF2.PdfFileReader(in_f)
                page_count = inpdf.getNumPages()
                progress = subwindows.Progress(self, page_count, "Producing PDFs...")
                progress.message["text"] = 'Reading files'
                for i in range(page_count):
                    output = inpdf.getPage(i).extractText()
                    progress.next()
                    with open(self._page_output_path(outfoldername, i), 'w', encoding="utf-8") as out_f:
                        out_f.write(output)
            self.message["text"] = "Done"
        except IOError:
            logger.warning("Text extraction : Could not open one of the files.")
            messagebox.showerror(title="Error",
                                 message="IO Error occured :\nImpossible to open one of the files\nNo output produced!")
        except Exception as e:
            logger.warning("Text extraction : Unknown error occured during PDF production. %s", e)
            messagebox.showerror(title="Error",
                                 message="An Error occured :\n{}\nNo output produced!".format(e))
        finally:
            if progress is not None:
                progress.close()

    def close(self, *args):
        """Destroy the window; extra arguments (e.g. a key event) are ignored."""
        self.destroy()