improving OCR
This commit is contained in:
parent
1b69fd4305
commit
2c2adcfa7c
2 changed files with 28 additions and 0 deletions
13
crawler/91_recalculate_floorplan.py
Normal file
13
crawler/91_recalculate_floorplan.py
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
from data_access import Listing
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Re-run the floor-plan OCR for every listing whose stored OCR estimate is
# missing or implausible.

# Accepted range for a stored estimate (presumably square metres -- confirm
# against the Listing model); anything outside it is recomputed.
MIN_PLAUSIBLE_SQM = 10
MAX_PLAUSIBLE_SQM = 200


def _needs_recalculation(listing):
    """Return True when the listing's stored OCR value is absent or out of range."""
    sqm = listing.sqm_ocr
    if sqm is None:
        return True
    return sqm < MIN_PLAUSIBLE_SQM or sqm > MAX_PLAUSIBLE_SQM


listings = Listing.get_all_listings()
recalculate_listings = [
    candidate for candidate in listings if _needs_recalculation(candidate)
]

# tqdm only wraps the iterable to show progress; the work itself is the
# forced recalculation on each selected listing.
progress = tqdm(recalculate_listings)
for listing in progress:
    listing.calculate_sqm_ocr(recalculate=True)
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
|
||||||
|
|
@ -32,9 +34,22 @@ def calculate_model(image_path):
|
||||||
estimated_sqm = extract_total_sqm(output)
|
estimated_sqm = extract_total_sqm(output)
|
||||||
return estimated_sqm, output, predictions_tensor
|
return estimated_sqm, output, predictions_tensor
|
||||||
|
|
||||||
|
def improve_img_for_ocr(img: Image.Image):
    """Return a pre-processed copy of *img* that is easier for OCR to read.

    The image is converted to 8-bit grayscale, upscaled by a factor of 1.2
    with bicubic interpolation, and binarised with a Gaussian adaptive
    threshold.

    Args:
        img: The PIL image to pre-process. It is not modified.

    Returns:
        A new PIL image containing the thresholded (black/white) result.
    """
    gray = np.array(img.convert('L'))
    # cv2.resize returns the resized array instead of working in place, so
    # the result has to be kept; discarding it would silently skip the
    # upscale and threshold the image at its original size.
    enlarged = cv2.resize(gray, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
    # Block size 11 and constant 2: each pixel is compared against the
    # Gaussian-weighted mean of its 11x11 neighbourhood minus 2.
    thresh = cv2.adaptiveThreshold(
        enlarged, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    return Image.fromarray(thresh)
|
||||||
|
|
||||||
def calculate_ocr(image_path):
    """Estimate the total area of a floor plan by running OCR on its image.

    Tesseract is first run on the unmodified image. If no area can be
    extracted from that text, the image is pre-processed with
    ``improve_img_for_ocr`` and OCR is retried once; every such retry is
    appended to ``recalculating.log``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(estimated_sqm, text)`` tuple: the value produced by
        ``extract_total_sqm`` (``None`` when nothing was found) and the OCR
        text it was extracted from. When the retry ran, both elements come
        from the retry, even if it also found nothing.
    """
    # Open the image in a context manager so the underlying file handle is
    # released on every path; this function is called for many listings.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
        estimated_sqm = extract_total_sqm(text)
        if estimated_sqm is not None:
            return estimated_sqm, text

        # First pass found nothing: retry on a cleaned-up version.
        improved_img = improve_img_for_ocr(img)
        text2 = pytesseract.image_to_string(improved_img)
        estimated_sqm2 = extract_total_sqm(text2)

    with open("recalculating.log", "a") as f:
        f.write(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n")
    return estimated_sqm2, text2
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue