
I'm using TesserOCR to read out CVs and it's working wonderfully, really. I was wondering, however, whether it's somehow possible to split the output up into blocks. What I mean by this is that it would read the document like this:

personal info ENTER Work experience ENTER education ENTER

You get the gist. This would make further processing a lot easier. Does anyone have experience with this library, or with Tesseract in general, who could help me out? (Excuse my messy work-in-progress code; I've also put two rough sketches of what I'm imagining at the bottom of this post.)

# imports
import io
import os
import re
import string
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from tkinter import filedialog as fd


import cv2
import langdetect
import numpy as np
import pdfplumber
import tesserocr
from PIL import Image
from PIL.PpmImagePlugin import PpmImageFile
from pdf2image import convert_from_path, convert_from_bytes

# removes empty lines and obvious OCR noise (stray single letters, lines starting with an underscore) from the text
def clean(text):
    filtered = re.sub(r'\n\s*\n', '\n', text)
    individual_lines = re.split('\n', filtered)
    filtered_lines = []
    for line in individual_lines:
        no_mistakes_found = True
        found_single_letter = re.findall('^[a-zA-Z]$', line)
        if found_single_letter:
            no_mistakes_found = False

        found_underscore = re.findall('^_', line)
        if found_underscore:
            no_mistakes_found = False

        line = re.sub(r'^\W\s', '', line)
        line = re.sub(r'^\s', '', line)
        line = re.sub(r'^[b-zB-Z0-9]\s', '', line)
        
        if no_mistakes_found:
            filtered_lines.append(line)

    return ''.join(line + '\n' for line in filtered_lines)

# builds the whitelist of characters the OCR is allowed to recognise
def createWhitelist():
    letters = string.ascii_letters
    numbers = string.digits
    whitespace = string.whitespace
    punctuation = ',.!@?&8+-_%$#()*`~:;"/'
    return letters + numbers + punctuation + whitespace


def createBlacklist():
    return ''


# enlarges the document image
def resize(image):
    return cv2.pyrUp(image)


# converts a colour image to a grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


# converts an image to a binary image
def thresholding(image):
    # thresholds the image into a binary image (black and white)
    return cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)[1]


# converts an image to an inverted binary image: black becomes white, white becomes black
def invthresholding(image):
    return cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)[1]


# uses Canny Edge Detection to detect the edges in an image
def cannyEdge(image):
    return cv2.Canny(image, 10, 200)


# blurs the image and uses the blurred image to remove any large black areas
def medianBlurDiff(image):
    # src = https://stackoverflow.com/questions/5442910/how-to-use-multiprocessing-pool-map-with-multiple-arguments
    bg = cv2.medianBlur(image, 151)  # suitably large kernel to cover all text
    cv2.imwrite('blur.jpeg', bg)
    return 255 - cv2.absdiff(bg, image)


# uses a sample to detect in which language the document is written, currently limited to Dutch, Polish and English
def detectLanguage(images):
    if not isinstance(images[0], (Image.Image, PpmImageFile)):
        img = Image.fromarray(images[0])
    else:
        img = images[0]

    sample_text = tesserocr.image_to_text(img)
    lang = langdetect.detect(sample_text)
    finalLang = ''
    if lang == 'en':
        finalLang = 'eng'
    elif lang == 'nl':
        finalLang = 'nld'
    elif lang == 'pl':
        finalLang = 'pol'
    print('The current document has language code: ' + finalLang)
    return finalLang


# Attempts to prepare the image without preprocessing
def noPreproccessingAttempt(images):
    return images


# Attempts to prepare the image by resizing and grayscaling
def onlyGrayscaleResizeAttempt(images):
    imgs = []
    for i in images:
        resized = resize(i)
        gray = get_grayscale(resized)
        imgs.append(gray)
    return imgs


# Attempts to prepare the image by resizing, grayscaling and thresholding
def grayscalingThresholdResizeAttempt(images):
    imgs = []
    for i in images:
        resized = resize(i)
        gray = get_grayscale(resized)
        cv2.imwrite('gray.jpeg', gray)
        thresh = medianBlurDiff(gray)
        cv2.imwrite('thresh.jpeg', thresh)
        imgs.append(thresh)

    return imgs


# Attempts to prepare the image by resizing, grayscaling and inverted thresholding
def grayscalingInvertedThresholdResizeAttempt(images):
    imgs = []
    for i in images:
        resized = resize(i)
        gray = get_grayscale(resized)
        invthresh = invthresholding(gray)
        imgs.append(invthresh)

    return imgs


# Attempts to prepare the image by resizing, grayscaling, thresholding and canny edge detection
def grayscalingThresholdResizeEdgedAttempt(images):
    imgs = []
    for i in images:
        resized = resize(i)
        gray = get_grayscale(resized)
        thresh = medianBlurDiff(gray)
        edge = cannyEdge(thresh)
        imgs.append(edge)

    return imgs


# Attempts to prepare the image by resizing, grayscaling, inverted thresholding and canny edge detection
def grayscalingInvertedThresholdResizeEdgedAttempt(images):
    imgs = []
    for i in images:
        resized = resize(i)
        gray = get_grayscale(resized)
        edge = cannyEdge(gray)
        thresh = invthresholding(edge)
        imgs.append(thresh)

    return imgs


# checks if the given file is an image file or a PDF
# if the file is a PDF, it checks whether it is a searchable PDF
# if the PDF is searchable, the text is extracted directly
# if not, the PDF is converted to images and read out by the OCR
def pdfOrImg(img, filePath):
    pdfText = ''
    if img is None:
        # not an image file
        with pdfplumber.open(filePath) as pdf:
            testPage = pdf.pages[0].extract_text()
            if testPage is None:
                print('[STARTING OCR SCRIPT - NON-SEARCHABLE PDF] - ' + str(time.strftime("%H:%M:%S")))
                # PDF is a scanned image or something similar and needs to go to the OCR
                print('File is a .pdf, BUT is a non-searchable PDF, printing WITH OCR')
                images = []
                pages = convert_from_path(filePath)
                for idx, page in enumerate(pages):
                    with io.BytesIO() as f:
                        page.save(f, format="jpeg")
                        f.seek(0)
                        data = np.asarray(bytearray(f.read()), dtype=np.uint8)
                        images.append(cv2.imdecode(data, cv2.IMREAD_COLOR))
                return images, pdfText
            else:
                print('[STARTING OCR SCRIPT - SEARCHABLE PDF] - ' + str(time.strftime("%H:%M:%S")))
                print('File is a .pdf, printing without OCR')
                pageCount = len(pdf.pages)
                for x in range(pageCount):
                    page = pdf.pages[x].extract_text()
                    pdfText += page
                print('[ENDING OCR SCRIPT - SEARCHABLE PDF] - ' + str(time.strftime("%H:%M:%S")))
                return None, pdfText
    else:
        return [img], pdfText


# writes the result to a .txt file
def writeFile(filePath, text):
    name = os.path.basename(filePath)
    filename = name + '.txt'

    with open(filename, 'w', encoding="utf-8") as file:
        file.write(text)


# read out all the preprocessed images using OCR
def read(language, images):
    with tesserocr.PyTessBaseAPI(lang=language) as api:
        print('start read')
        whitelist = createWhitelist()
        blacklist = createBlacklist()
        totalText = []
        totalConfidence = []
        for image in images:
            img = Image.fromarray(image)
            api.SetPageSegMode(tesserocr.PSM.AUTO)
            api.SetVariable('tessedit_char_whitelist', whitelist)
            api.SetVariable('tessedit_char_blacklist', blacklist)
            api.SetImage(img)
            api.Recognize()
            totalText.append(api.GetUTF8Text())
            totalConfidence.append(api.MeanTextConf())
        print(np.mean(totalConfidence))
        print('end read')
        return ' '.join(totalText), np.mean(totalConfidence)


def process(images):
    language = detectLanguage(images)
    onlyGrayscaleResizeImages = onlyGrayscaleResizeAttempt(images)
    grayscalingThresholdResizeImages = grayscalingThresholdResizeAttempt(images)
    imgs = [
        onlyGrayscaleResizeImages,
        grayscalingThresholdResizeImages,
    ]
    text = []
    confidence = []
    # uses a ProcessPoolExecutor to run the OCR passes in parallel and speed up the application
    with ProcessPoolExecutor(max_workers=6) as executor:
        part = partial(read, language)
        for result in executor.map(part, imgs):
            text.append(result[0])
            confidence.append(result[1])
    # calculates the highest confidence rating and selects that text as the result
    highestConfidence = np.amax(confidence)
    print('the highest confidence = {}'.format(highestConfidence))
    index = int(np.argmax(confidence))
    finalText = text[index]
    return finalText


def runOCR(filePath):
    file = cv2.imread(filePath)
    images, pdfText = pdfOrImg(file, filePath)

    if (pdfText == '' and images is not None):
        finalText = process(images)
        # writeFile(filePath, finalText)
        finalText = clean(finalText)
        return finalText
    else:
        print('Image was a searchable PDF, printed without OCR')
        pdfText = clean(pdfText)
        return pdfText

Right now it just reads out everything, but I wonder whether, once it finishes reading one area, it could recognise that, so that I could insert a newline (or some other marker character) there, or even draw boxes around those areas. The two sketches below show roughly what I have in mind.
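This is a minimal, untested sketch of the kind of block splitting I mean, assuming tesserocr's result iterator at RIL.BLOCK level lines up with the visual sections of the CV (read_blocks is just a name I made up):

# rough sketch: read the text of each detected layout block separately
import tesserocr
from tesserocr import PyTessBaseAPI, RIL, iterate_level


def read_blocks(image, language='eng'):
    # image is a PIL image (e.g. Image.fromarray(...) like in read() above)
    blocks = []
    with PyTessBaseAPI(lang=language, psm=tesserocr.PSM.AUTO) as api:
        api.SetImage(image)
        api.Recognize()
        it = api.GetIterator()
        for block in iterate_level(it, RIL.BLOCK):
            block_text = block.GetUTF8Text(RIL.BLOCK)
            if block_text and block_text.strip():
                blocks.append(block_text.strip())
    return blocks


# '\n\n'.join(read_blocks(img)) would then give the
# "personal info ENTER work experience ENTER education ENTER" style output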

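And for the boxes, a sketch of what I imagine, assuming api.GetComponentImages() at RIL.BLOCK level gives back one bounding box per detected section as (image, box, block_id, paragraph_id) tuples (draw_block_boxes is again my own made-up name, untested):

# rough sketch: draw a rectangle around every layout block Tesseract finds
import cv2
import tesserocr
from tesserocr import PyTessBaseAPI, RIL
from PIL import Image


def draw_block_boxes(image, language='eng'):
    # image is a BGR numpy array like the ones returned by pdfOrImg()
    pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    with PyTessBaseAPI(lang=language, psm=tesserocr.PSM.AUTO) as api:
        api.SetImage(pil_img)
        boxes = api.GetComponentImages(RIL.BLOCK, True)  # True = text regions only
        for _, box, _, _ in boxes:
            x, y, w, h = box['x'], box['y'], box['w'], box['h']
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    return image


# e.g. cv2.imwrite('boxes.jpeg', draw_block_boxes(images[0]))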