📦 itsmadhusudhan / market-analysis

📄 pdf_classifier.py · 106 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import re
import base64
import json
import os

# Case-insensitive matcher for the heading of an unaudited results statement
# (standalone or consolidated). Compiled at import time so it can be reused
# for every page without rebuilding the pattern.
FINANCIAL_PATTERN = re.compile(
    r"statement of unaudited (standalone|consolidated) financial results",
    flags=re.I,
)

# Words/phrases whose presence routes a page to the vision path.
_VISION_KEYWORDS = (
    "highlights", "at a glance", "overview", "summary",
    "performance", "revenue", "ebitda", "profit", "break-up",
)

# Single whole-word alternation, built once at import time. Keywords are
# escaped so that any regex metacharacter in them is matched literally.
_VISION_KEYWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(kw) for kw in _VISION_KEYWORDS) + r")\b"
)


def classify_page(text, blocks, text_threshold=50, box_threshold=20, short_block_ratio=0.5):
    """Decide whether a page is handled as plain text or sent down the vision path.

    The rules are applied in order and the first one that fires wins.

    Args:
        text: Text extracted from the page; surrounding whitespace is ignored.
        blocks: Sequence of block records. Only index 4 of each record is
            read, and it is treated as that block's text.
        text_threshold: Pages whose stripped text has fewer characters than
            this are classified as "vision".
        box_threshold: Pages with more blocks than this are classified as
            "vision".
        short_block_ratio: If the fraction of non-blank blocks containing
            five or fewer words exceeds this value, the page is classified
            as "vision".

    Returns:
        "vision" or "text".
    """
    text = text.strip()

    # 1. No text, or very little of it.
    if len(text) < text_threshold:
        return "vision"

    # 2. Too many blocks (fragmented layout).
    if len(blocks) > box_threshold:
        return "vision"

    # 3. Mostly short blocks. Blank blocks are excluded from the ratio, and
    #    the guard avoids dividing by zero when no block carries text.
    word_counts = [len(b[4].split()) for b in blocks if b[4].strip()]
    if word_counts:
        short_share = sum(count <= 5 for count in word_counts) / len(word_counts)
        if short_share > short_block_ratio:
            return "vision"

    # 4. Keyword routing: lowercase the page text once and scan it with the
    #    precompiled whole-word pattern.
    if _VISION_KEYWORD_PATTERN.search(text.lower()):
        return "vision"

    return "text"


def extract_text_page(page):
    """Return the page's plain-text extraction with surrounding whitespace removed."""
    raw_text = page.get_text("text")
    return raw_text.strip()


def extract_vision_page(page):
    """Render a page to a compressed grayscale JPEG and OCR it.

    Returns a dict with two keys: "text" (the stripped OCR output) and
    "image_base64" (the JPEG bytes, base64-encoded as a UTF-8 string).
    """
    # Rasterise in grayscale through a 1.5x scaling matrix.
    scale = fitz.Matrix(1.5, 1.5)
    pixmap = page.get_pixmap(matrix=scale, colorspace=fitz.csGRAY)
    png_stream = io.BytesIO(pixmap.tobytes("png"))
    image = Image.open(png_stream)

    # Re-encode as a quality-60 JPEG before base64-encoding it.
    jpeg_stream = io.BytesIO()
    image.save(jpeg_stream, format="JPEG", quality=60, optimize=True)
    encoded_jpeg = base64.b64encode(jpeg_stream.getvalue()).decode("utf-8")

    # OCR the rendered image so a text form of the page is always available.
    ocr_text = pytesseract.image_to_string(image)

    return {"text": ocr_text.strip(), "image_base64": encoded_jpeg}


def process_pdf(file_path):
    """Classify and extract every page of a PDF.

    Each page is classified with ``classify_page``. Pages classified as
    "text" contribute their stripped text; all other pages contribute the
    dict produced by ``extract_vision_page``. When a vision page's OCR text
    matches ``FINANCIAL_PATTERN``, its JPEG is also written to
    ``page_<n>.jpg`` in the current working directory.

    Args:
        file_path: Path of the PDF to open.

    Returns:
        A list with one dict per page, in page order, holding "page"
        (1-based page number), "mode" and "content".
    """
    results = []

    # The context manager closes the document even if a page raises part-way.
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Extract once; the same text is used for classification and output.
            text = page.get_text("text")
            blocks = page.get_text("blocks")

            mode = classify_page(text, blocks)
            if mode == "text":
                content = text.strip()
            else:
                content = extract_vision_page(page)

            results.append({
                "page": page_number,
                "mode": mode,
                "content": content,
            })

            # Only vision pages carry an OCR dict, so check the mode first.
            if mode == "vision" and FINANCIAL_PATTERN.search(content["text"].lower()):
                jpeg_bytes = base64.b64decode(content["image_base64"])
                with open(f"page_{page_number}.jpg", "wb") as img_file:  # save compressed JPEG
                    img_file.write(jpeg_bytes)

    return results


if __name__ == "__main__":
    # Script entry point: process the sample PDF, dump every page record to
    # pages.json, then print a one-line summary per page.
    source_pdf = "./docs/IPL-financial-results.pdf"
    pages = process_pdf(source_pdf)

    with open("pages.json", "w") as out_file:
        json.dump(pages, out_file, indent=2)

    for entry in pages:
        page_no, page_mode = entry["page"], entry["mode"]
        print(f"Page {page_no} ({page_mode})...")