import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import re
import base64
import json
import os

# Precompile regex once
FINANCIAL_PATTERN = re.compile(
    r"statement of unaudited (standalone|consolidated) financial results",
    re.IGNORECASE
)
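# Example headings this matches (case-insensitive):
#   "Statement of Unaudited Standalone Financial Results"
#   "Statement of Unaudited Consolidated Financial Results"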

def classify_page(text, blocks, text_threshold=50, box_threshold=20, short_block_ratio=0.5):
    """Route a page to plain text extraction ("text") or image/OCR handling ("vision")."""
    text = text.strip()
    num_blocks = len(blocks)

    # 1. No text, or very little text -> likely scanned/graphical, send to vision
    if len(text) < text_threshold:
        return "vision"

    # 2. Too many blocks -> fragmented layout (charts, callouts), send to vision
    if num_blocks > box_threshold:
        return "vision"

    # 3. If a large share of blocks contain only a few words -> likely a slide with bullets
    words_per_block = [len(b[4].split()) for b in blocks if b[4].strip()]
    if words_per_block and (sum(w <= 5 for w in words_per_block) / len(words_per_block)) > short_block_ratio:
        return "vision"

    # 4. Keyword routing: summary/highlight pages are usually laid out graphically
    keywords = ["highlights", "at a glance", "overview", "summary",
                "performance", "revenue", "ebitda", "profit", "break-up"]
    if any(re.search(rf"\b{kw}\b", text.lower()) for kw in keywords):
        return "vision"

    return "text"
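
# Illustrative routing behaviour (not part of the original script):
#   classify_page("", []) -> "vision"  (no extractable text, e.g. a scanned page)
#   a dense prose page with a few large blocks and no summary keywords -> "text"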

def extract_text_page(page):
    """Use text extraction only."""
    return page.get_text("text").strip()

def extract_vision_page(page):
    """Convert the page to a compressed image for a vision model, plus an OCR text pass."""
    # Grayscale render at 1.5x zoom (~108 DPI) keeps the payload small
    pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), colorspace=fitz.csGRAY)
    img = Image.open(io.BytesIO(pix.tobytes("png")))

    # Compress to JPEG and base64-encode for transport
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=60, optimize=True)
    img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    # Optional OCR fallback if no vision model is available (needs the Tesseract binary installed)
    text = pytesseract.image_to_string(img)

    return {
        "text": text.strip(),
        "image_base64": img_b64
    }
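
# Illustrative sketch (not part of the original script): how the "vision" payload above could
# be sent to a multimodal model. Assumes the OpenAI Python SDK and an OPENAI_API_KEY env var;
# the model name and prompt are placeholders, so swap in whatever vision model you actually use.
def describe_vision_page(content, model="gpt-4o-mini"):
    """Hypothetical helper: ask a vision model to transcribe the compressed page image."""
    from openai import OpenAI  # imported lazily so the rest of the script runs without the SDK

    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract the text and any tables from this page."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{content['image_base64']}"}},
            ],
        }],
    )
    return response.choices[0].message.content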

def process_pdf(file_path):
    """Classify every page and extract it through the text or vision path."""
    results = []
    doc = fitz.open(file_path)

    for i, page in enumerate(doc, start=1):
        # Extract text and layout blocks once per page
        text = page.get_text("text")
        blocks = page.get_text("blocks")
        mode = classify_page(text, blocks)

        if mode == "text":
            content = text.strip()
        else:
            content = extract_vision_page(page)

        results.append({
            "page": i,
            "mode": mode,
            "content": content
        })

        # If a vision page's OCR text matches the financial-results heading,
        # save its compressed JPEG to disk for later review
        if mode == "vision" and FINANCIAL_PATTERN.search(content["text"]):
            img_b64 = content["image_base64"]
            img_bytes = base64.b64decode(img_b64)
            with open(f"page_{i}.jpg", "wb") as img_file:
                img_file.write(img_bytes)

    doc.close()
    return results

if __name__ == "__main__":
    pdf_path = "./docs/IPL-financial-results.pdf"
    data = process_pdf(pdf_path)

    with open("pages.json", "w") as f:
        json.dump(data, f, indent=2)

    for page_data in data:
        print(f"Page {page_data['page']} ({page_data['mode']})...")