import fitz
# Sample PDF that main() passes to both analysis functions.
PDF_PATH = "test_datasets/pdfs/mixed/5PFVA6CO2FP66IJYJJ4YMWOLK5EHRCCD.pdf"
def _print_span_report(span_idx, span, spans):
    """Print one span's geometry and the spacing verdict against its right neighbour.

    Args:
        span_idx: Position of ``span`` inside ``spans``.
        span: Span dictionary from ``page.get_text("dict")``.
        spans: Every span of the enclosing line (not just the inspected prefix),
            so the neighbour of the last inspected span can still be measured.
    """
    print(f"\n SPAN {span_idx}:")
    print(f" Text: '{span['text']}'")
    print(f" Bbox: {span['bbox']}")
    print(f" Font: {span['font']}")
    print(f" Size: {span['size']}")
    print(f" Origin: {span['origin']}")

    bbox = span["bbox"]
    span_width = bbox[2] - bbox[0]
    char_count = len(span["text"])

    # None means "no characters", so no per-character width can be derived.
    avg_char_width = span_width / char_count if char_count > 0 else None
    if avg_char_width is not None:
        print(f" Span width: {span_width:.2f}pt")
        print(f" Chars: {char_count}")
        print(f" Avg char width: {avg_char_width:.2f}pt")

    if span_idx >= len(spans) - 1:
        return  # last span on the line: nothing to its right

    next_span = spans[span_idx + 1]
    gap = next_span["bbox"][0] - bbox[2]
    print(f" Gap to next: {gap:.2f}pt")

    if avg_char_width is None:
        return

    # A zero-width span would make the ratio a division by zero; skip only the
    # ratio line and still report the merge/space verdict.
    if avg_char_width > 0:
        gap_ratio = gap / avg_char_width
        print(f" Gap ratio: {gap_ratio:.2f}x char width")

    if gap < avg_char_width * 0.5:
        print(" → Should MERGE (gap < 0.5x char width)")
    else:
        print(" → Should INSERT SPACE (gap >= 0.5x char width)")


def analyze_content_stream(pdf_path):
    """Dump span geometry of the first page of a PDF using the ``dict`` text format.

    Prints, for up to 3 blocks, 5 lines per block and 10 spans per line, each
    span's text, bbox, font, size and origin, the average character width, and
    the horizontal gap to the next span together with a merge/insert-space
    verdict based on a 0.5x average-character-width threshold.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        None. All output goes to stdout.

    Raises:
        Whatever ``fitz.open`` or page access raises (for example, an
        unreadable file or a document without pages). The document is closed
        even when an exception escapes.
    """
    rule = "=" * 80

    # The context manager guarantees the document is closed on every exit path.
    with fitz.open(pdf_path) as doc:
        page = doc[0]

        print(rule)
        print("RAW CONTENT STREAM ANALYSIS")
        print(rule)

        text_dict = page.get_text("dict")
        print(f"\nFound {len(text_dict['blocks'])} blocks")

        for block_idx, block in enumerate(text_dict["blocks"][:3]):
            # Only type-0 blocks are inspected; those are the ones read for "lines".
            if block["type"] != 0:
                continue

            print(f"\n{rule}")
            print(f"BLOCK {block_idx}")
            print(f"{rule}")
            print(f"Bbox: {block['bbox']}")

            for line_idx, line in enumerate(block["lines"][:5]):
                print(f"\n LINE {line_idx}:")
                print(f" Bbox: {line['bbox']}")
                print(f" Direction: {line['dir']}")

                spans = line["spans"]
                for span_idx, span in enumerate(spans[:10]):
                    _print_span_report(span_idx, span, spans)
def _escape_raw_char(entry):
    """Return an ASCII-safe rendering of one ``rawdict`` character entry.

    ``rawdict`` spans carry character dictionaries whose ``"c"`` key holds the
    character. Plain strings and integer code points are accepted as well so
    the helper tolerates either representation.
    """
    glyph = entry["c"] if isinstance(entry, dict) else entry
    if isinstance(glyph, int):
        glyph = chr(glyph)
    return "".join(
        ch if ord(ch) < 128 else f"\\x{ord(ch):02x}" for ch in glyph
    )


def analyze_with_rawdict(pdf_path):
    """Dump per-span positions of the first page using the ``rawdict`` text format.

    Prints, for up to 2 blocks, 3 lines per block and 15 spans per line, the
    span text (non-ASCII characters escaped as ``\\xNN``), its left x
    coordinate, width and character count, followed by the gap to the next
    span and that gap expressed in average character widths.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        None. All output goes to stdout.

    Raises:
        Whatever ``fitz.open`` or page access raises. The document is closed
        even when an exception escapes.
    """
    rule = "=" * 80

    # The context manager guarantees the document is closed on every exit path.
    with fitz.open(pdf_path) as doc:
        page = doc[0]

        print("\n" + rule)
        print("RAWDICT ANALYSIS (Low-level)")
        print(rule)

        raw = page.get_text("rawdict")

        for block in raw["blocks"][:2]:
            # Only type-0 blocks are inspected; those are the ones read for "lines".
            if block["type"] != 0:
                continue

            print(f"\nBlock bbox: {block['bbox']}")

            for line in block["lines"][:3]:
                print(f" Line: {line['bbox']}")
                spans = line["spans"]
                print(f" {len(spans)} spans:")

                for i, span in enumerate(spans[:15]):
                    chars = span["chars"]
                    text = "".join(_escape_raw_char(entry) for entry in chars)
                    bbox = span["bbox"]
                    width = bbox[2] - bbox[0]
                    char_count = len(chars)
                    print(f" [{i}] '{text}' @ x={bbox[0]:.1f} w={width:.1f} chars={char_count}")

                    if i >= len(spans) - 1:
                        continue  # last span on the line: no gap to report

                    next_span = spans[i + 1]
                    gap = next_span["bbox"][0] - bbox[2]
                    print(f" gap={gap:.1f}pt", end="")
                    # Skip the ratio for empty or zero-width spans, where the
                    # average character width is undefined or zero.
                    if char_count > 0 and width > 0:
                        avg_cw = width / char_count
                        print(f" ({gap / avg_cw:.2f}x)", end="")
                    print()
def main():
    """Run both span-spacing analyses on ``PDF_PATH`` and print a reading guide.

    Any exception raised by the analyses is not propagated: its message is
    printed and the traceback is emitted through ``traceback.print_exc()``.
    """
    rule = "=" * 80
    # Joined with newlines; the empty first and last items give the text its
    # leading and trailing line break.
    summary = "\n".join(
        (
            "",
            "Key findings to look for:",
            "1. Are spans single characters or multiple characters?",
            "2. What are the actual gaps between spans in points?",
            "3. What is the average character width?",
            "4. What ratio of gap/char_width separates words vs characters?",
            "If we see:",
            "- Single-char spans with gaps of ~0.1-0.3x char width → NO SPACE",
            "- Single-char spans with gaps of ~0.5-1.0x char width → SPACE",
            "- Multi-char spans with gaps > 0.2x char width → SPACE",
            "This will tell us what threshold to use!",
            "",
        )
    )

    print(f"Analyzing: {PDF_PATH}")
    print()

    try:
        analyze_content_stream(PDF_PATH)
        print("\n\n")
        analyze_with_rawdict(PDF_PATH)

        print("\n" + rule)
        print("SUMMARY")
        print(rule)
        print(summary)
    except Exception as exc:
        import traceback

        print(f"Error: {exc}")
        traceback.print_exc()


if __name__ == "__main__":
    main()