improving OCR

This commit is contained in:
Kadir 2025-03-30 23:41:52 +01:00
parent 1b69fd4305
commit 2c2adcfa7c
2 changed files with 28 additions and 0 deletions

View file

@@ -0,0 +1,13 @@
from data_access import Listing
from tqdm import tqdm
# Maintenance script: re-run the OCR-based area estimate for every listing
# whose stored ``sqm_ocr`` value is missing or falls outside the 10-200 range.


def _needs_recalculation(sqm):
    """Return True when *sqm* is missing, below 10, or above 200."""
    return sqm is None or sqm < 10 or sqm > 200


listings = Listing.get_all_listings()
recalculate_listings = [
    candidate for candidate in listings if _needs_recalculation(candidate.sqm_ocr)
]

# tqdm only wraps the iteration to display a progress bar.
for listing in tqdm(recalculate_listings):
    listing.calculate_sqm_ocr(recalculate=True)

View file

@@ -1,5 +1,7 @@
import re import re
from PIL import Image from PIL import Image
import cv2
import numpy as np
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import pytesseract import pytesseract
@@ -32,9 +34,22 @@ def calculate_model(image_path):
estimated_sqm = extract_total_sqm(output) estimated_sqm = extract_total_sqm(output)
return estimated_sqm, output, predictions_tensor return estimated_sqm, output, predictions_tensor
def improve_img_for_ocr(img: Image.Image) -> Image.Image:
    """Return a preprocessed copy of *img* intended to be easier to OCR.

    The image is converted to 8-bit grayscale, upscaled by a factor of 1.2
    with bicubic interpolation, and binarized with a Gaussian adaptive
    threshold (block size 11, constant 2).

    Args:
        img: The PIL image to preprocess. It is not modified.

    Returns:
        A new single-channel PIL image holding the thresholded result.
    """
    gray = np.array(img.convert('L'))
    # cv2.resize returns a new array rather than resizing in place, so the
    # result has to be captured; otherwise the upscale is silently dropped.
    enlarged = cv2.resize(gray, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
    thresh = cv2.adaptiveThreshold(
        enlarged,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    return Image.fromarray(thresh)
def calculate_ocr(image_path):
    """Run Tesseract OCR on the image at *image_path* and extract the area.

    The raw image is tried first. If ``extract_total_sqm`` finds no value in
    that text, OCR is retried on a preprocessed copy of the image (see
    ``improve_img_for_ocr``) and the outcome of the retry is appended to
    ``recalculating.log``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(estimated_sqm, text)`` tuple. When the retry path is taken, both
        elements come from the second OCR pass, so ``estimated_sqm`` may
        still be ``None``.
    """
    # Open via a context manager so the underlying file handle is released
    # even when this is called for many images in a batch run.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
        estimated_sqm = extract_total_sqm(text)
        if estimated_sqm is not None:
            return estimated_sqm, text
        # The preprocessed copy is built from pixel data, so it remains
        # usable after the source file is closed.
        improved_img = improve_img_for_ocr(img)

    text2 = pytesseract.image_to_string(improved_img)
    estimated_sqm2 = extract_total_sqm(text2)
    # Explicit encoding keeps logging from failing on non-ASCII paths under
    # platform-default codecs.
    with open("recalculating.log", "a", encoding="utf-8") as f:
        f.write(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n")
    return estimated_sqm2, text2