undoc 0.1.20

High-performance Microsoft Office document extraction to Markdown
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
"""Tests for undoc Python bindings."""

import ctypes
import io
import os
import platform
import pytest
import subprocess
import sys
import zipfile
from pathlib import Path

# Import eagerly so native-backed verification can fail hard when requested.
NATIVE_IMPORT_ERROR = None

try:
    import undoc.undoc as undoc_module
    from undoc import Undoc, UndocError, parse_file, parse_bytes, version
except OSError as load_failure:
    # Treat an OSError during import as "native bindings unavailable": record
    # the cause and bind placeholders so this module can still be imported.
    LIBRARY_AVAILABLE = False
    NATIVE_IMPORT_ERROR = load_failure
    # Keeps ``UndocError`` usable as an exception class in the fallback case.
    UndocError = Exception
    undoc_module = Undoc = parse_file = parse_bytes = version = None
else:
    LIBRARY_AVAILABLE = True


# Directory of sample Office documents used by the file-based tests below:
# "test-files", reached by four ``.parent`` steps up from this file. Tests that
# need a sample are skipped when the corresponding file does not exist.
TEST_FILES_DIR = Path(__file__).parent.parent.parent.parent / "test-files"


def _native_library_filename() -> str:
    """Return the undoc shared-library file name for the running OS.

    ``platform.system()`` values ``"Windows"`` and ``"Darwin"`` have their own
    names; any other value falls back to the ``.so`` name.
    """
    names_by_system = {
        "Windows": "undoc.dll",
        "Darwin": "libundoc.dylib",
    }
    return names_by_system.get(platform.system(), "libundoc.so")


# Strict mode: when UNDOC_REQUIRE_NATIVE=1 and the import above failed, abort
# loading this module instead of continuing with the placeholder bindings.
if not LIBRARY_AVAILABLE and os.environ.get("UNDOC_REQUIRE_NATIVE") == "1":
    configured_path = os.environ.get("UNDOC_LIB_PATH")
    if configured_path:
        # Mention the override so the failure names the library that was tried.
        configured_suffix = f" (UNDOC_LIB_PATH={configured_path})"
    else:
        configured_suffix = ""
    failure_message = (
        "UNDOC_REQUIRE_NATIVE=1 but undoc native bindings failed to load"
        f"{configured_suffix}: {NATIVE_IMPORT_ERROR}"
    )
    raise RuntimeError(failure_message) from NATIVE_IMPORT_ERROR


def create_minimal_docx_bytes(text: str = "Привет из Python") -> bytes:
    """Create a tiny DOCX fixture without relying on external test files.

    The archive contains exactly the four parts written below: the content
    types part, the package relationships, an empty document relationships
    part, and ``word/document.xml`` holding one paragraph with one run.

    Args:
        text: Text placed in the single run. It is XML-escaped before being
            embedded, so ``<``, ``>`` and ``&`` cannot break the document.

    Returns:
        The bytes of the resulting uncompressed (``ZIP_STORED``) archive.
    """
    # Function-scope import: only this helper needs it.
    from xml.sax.saxutils import escape

    # Escape the caller's text; interpolating it raw would produce malformed
    # XML for any input containing markup characters.
    document_xml = f"""<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p>
      <w:r><w:t>{escape(text)}</w:t></w:r>
    </w:p>
  </w:body>
</w:document>"""

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_STORED) as zf:
        zf.writestr(
            "[Content_Types].xml",
            """<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>""",
        )
        zf.writestr(
            "_rels/.rels",
            """<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>""",
        )
        zf.writestr(
            "word/_rels/document.xml.rels",
            """<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>""",
        )
        zf.writestr("word/document.xml", document_xml)
    return buf.getvalue()


class TestNativeVerificationHarness:
    """Checks that strict native mode fails instead of silently degrading."""

    def test_strict_native_mode_fails_closed_on_invalid_library_path(self, tmp_path):
        """Run pytest in a child process against a bogus library file.

        The child gets ``UNDOC_REQUIRE_NATIVE=1`` and ``UNDOC_LIB_PATH`` pointing
        at a text file; it must exit non-zero and its output must mention both
        the strict-mode flag and the bogus path.
        """
        this_file = Path(__file__)
        resolved_file = this_file.resolve()

        # A plain text file named like the platform's shared library.
        fake_library_path = tmp_path / _native_library_filename()
        fake_library_path.write_text("not a real shared library", encoding="utf-8")

        child_env = {
            **os.environ,
            "UNDOC_REQUIRE_NATIVE": "1",
            "UNDOC_LIB_PATH": str(fake_library_path),
            "PYTHONPATH": str(resolved_file.parents[1] / "src"),
        }
        command = [
            sys.executable,
            "-m",
            "pytest",
            str(this_file),
            "-k",
            "test_version_returns_string",
            "-q",
        ]

        completed = subprocess.run(
            command,
            cwd=resolved_file.parents[3],
            capture_output=True,
            text=True,
            env=child_env,
            check=False,
        )

        assert completed.returncode != 0
        child_output = completed.stdout + completed.stderr
        assert "UNDOC_REQUIRE_NATIVE=1" in child_output
        assert str(fake_library_path) in child_output


class FakeStringLibrary:
    """Stand-in native library for ownership/UTF-8 regression tests.

    Each string-returning entry point yields the address of a ctypes buffer
    that this object keeps referenced; ``undoc_free_string`` only records the
    address it receives.
    """

    def __init__(self):
        # Addresses handed to ``undoc_free_string``, in call order.
        self.freed = []
        # Buffers are retained so the returned addresses stay readable.
        self._buffers = []

    def _alloc(self, text: str) -> int:
        """Store *text* UTF-8 encoded in a new ctypes buffer; return its address."""
        encoded = text.encode("utf-8")
        holder = ctypes.create_string_buffer(encoded)
        self._buffers.append(holder)
        return ctypes.addressof(holder)

    # -- entry points that take no document handle -------------------------

    def undoc_version(self):
        """Return the address of the canned version string."""
        return self._alloc("1.2.3")

    def undoc_last_error(self):
        """Return the address of a canned non-ASCII error message."""
        return self._alloc("Ошибка native")

    # -- release hooks -----------------------------------------------------

    def undoc_free_string(self, ptr):
        """Record the pointer, converted to ``int``; nothing is released."""
        address = int(ptr)
        self.freed.append(address)

    def undoc_free_document(self, _handle):
        """Accept any handle and do nothing."""

    # -- entry points that take a document handle --------------------------

    def undoc_to_markdown(self, _handle, _flags):
        """Return the address of the canned Markdown text."""
        return self._alloc("Привет из Markdown")

    def undoc_get_title(self, _handle):
        """Return the address of the canned title."""
        return self._alloc("Заголовок")

    def undoc_get_author(self, _handle):
        """Return the address of the canned author."""
        return self._alloc("Автор")

    def undoc_get_resource_ids(self, _handle):
        """Return the address of a canned JSON list of resource ids."""
        return self._alloc('["rId1"]')

    def undoc_get_resource_info(self, _handle, _resource_id):
        """Return the address of a canned JSON resource description."""
        return self._alloc('{"filename":"Пример.png"}')


class TestVersion:
    """Sanity checks on the value returned by ``version()``."""

    def test_version_returns_string(self):
        """``version()`` returns a non-empty ``str``."""
        reported = version()
        assert isinstance(reported, str)
        assert reported != ""

    def test_version_format(self):
        """The version has at least two dot-separated components."""
        # Semver-like: at minimum "major.minor".
        components = version().split(".")
        assert len(components) >= 2


class TestParseFile:
    """``parse_file`` against a missing path and the optional sample documents."""

    # Sample documents; each format test is skipped when its file is absent.
    _DOCX_SAMPLE = TEST_FILES_DIR / "file-sample_1MB.docx"
    _XLSX_SAMPLE = TEST_FILES_DIR / "sample-xlsx-file.xlsx"
    _PPTX_SAMPLE = TEST_FILES_DIR / "file_example_PPT_1MB.pptx"

    def test_parse_nonexistent_file(self):
        """Parsing a file name expected not to exist raises ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            parse_file("nonexistent.docx")

    @pytest.mark.skipif(not _DOCX_SAMPLE.exists(), reason="Test file not available")
    def test_parse_docx(self):
        """The sample DOCX parses and reports a non-negative section count."""
        parsed = parse_file(self._DOCX_SAMPLE)
        assert parsed is not None
        assert parsed.section_count >= 0

    @pytest.mark.skipif(not _XLSX_SAMPLE.exists(), reason="Test file not available")
    def test_parse_xlsx(self):
        """The sample XLSX parses and reports a non-negative section count."""
        parsed = parse_file(self._XLSX_SAMPLE)
        assert parsed is not None
        assert parsed.section_count >= 0

    @pytest.mark.skipif(not _PPTX_SAMPLE.exists(), reason="Test file not available")
    def test_parse_pptx(self):
        """The sample PPTX parses and reports a non-negative section count."""
        parsed = parse_file(self._PPTX_SAMPLE)
        assert parsed is not None
        assert parsed.section_count >= 0


class TestConversion:
    """Output conversions (Markdown, text, JSON) of the sample DOCX."""

    @pytest.fixture
    def sample_docx(self):
        """Parsed sample DOCX; skips the requesting test when the file is missing."""
        docx_path = TEST_FILES_DIR / "file-sample_1MB.docx"
        if docx_path.exists():
            return parse_file(docx_path)
        pytest.skip("Test file not available")

    def test_to_markdown(self, sample_docx):
        """``to_markdown()`` returns a non-empty string."""
        rendered = sample_docx.to_markdown()
        assert isinstance(rendered, str)
        assert rendered != ""

    def test_to_markdown_with_frontmatter(self, sample_docx):
        """With ``frontmatter=True`` the output contains a ``---`` marker."""
        rendered = sample_docx.to_markdown(frontmatter=True)
        assert "---" in rendered

    def test_to_text(self, sample_docx):
        """``to_text()`` returns a non-empty string."""
        extracted = sample_docx.to_text()
        assert isinstance(extracted, str)
        assert extracted != ""

    def test_to_json(self, sample_docx):
        """``to_json()`` returns a string that opens with ``{``."""
        payload = sample_docx.to_json()
        assert isinstance(payload, str)
        assert payload.startswith("{")

    def test_to_json_compact(self, sample_docx):
        """``to_json(compact=True)`` returns a string without indented lines."""
        payload = sample_docx.to_json(compact=True)
        assert isinstance(payload, str)
        # A newline followed by two spaces would indicate pretty-printing.
        assert "\n  " not in payload

    def test_plain_text(self, sample_docx):
        """``plain_text()`` returns a string."""
        assert isinstance(sample_docx.plain_text(), str)


class TestMetadata:
    """Types and ranges of the metadata attributes on the sample DOCX."""

    @pytest.fixture
    def sample_docx(self):
        """Parsed sample DOCX; skips the requesting test when the file is missing."""
        docx_path = TEST_FILES_DIR / "file-sample_1MB.docx"
        if docx_path.exists():
            return parse_file(docx_path)
        pytest.skip("Test file not available")

    def test_section_count(self, sample_docx):
        """``section_count`` is a non-negative ``int``."""
        assert isinstance(sample_docx.section_count, int)
        assert sample_docx.section_count >= 0

    def test_resource_count(self, sample_docx):
        """``resource_count`` is a non-negative ``int``."""
        assert isinstance(sample_docx.resource_count, int)
        assert sample_docx.resource_count >= 0

    def test_title(self, sample_docx):
        """``title`` is either ``None`` or a ``str``."""
        value = sample_docx.title
        assert isinstance(value, str) or value is None

    def test_author(self, sample_docx):
        """``author`` is either ``None`` or a ``str``."""
        value = sample_docx.author
        assert isinstance(value, str) or value is None


class TestContextManager:
    """Using a parsed document as a context manager."""

    # Sample document; the test is skipped when it is absent.
    _DOCX_SAMPLE = TEST_FILES_DIR / "file-sample_1MB.docx"

    @pytest.mark.skipif(not _DOCX_SAMPLE.exists(), reason="Test file not available")
    def test_context_manager(self):
        """Markdown can be produced inside the ``with`` block, which then exits cleanly."""
        with parse_file(self._DOCX_SAMPLE) as doc:
            assert len(doc.to_markdown()) > 0
        # Nothing is asserted after the block: release of the document is not
        # directly observable here, so the check is only that leaving the
        # ``with`` statement does not crash.


class TestParseBytes:
    """``parse_bytes`` fed with the raw contents of the sample DOCX."""

    # Sample document; the test is skipped when it is absent.
    _DOCX_SAMPLE = TEST_FILES_DIR / "file-sample_1MB.docx"

    @pytest.mark.skipif(not _DOCX_SAMPLE.exists(), reason="Test file not available")
    def test_parse_bytes(self):
        """Parsing the file's bytes yields a document with non-empty Markdown."""
        raw = self._DOCX_SAMPLE.read_bytes()

        parsed = parse_bytes(raw)
        assert parsed is not None
        assert len(parsed.to_markdown()) > 0


class TestResources:
    """Resource listing and lookup on a sample document that has resources."""

    @pytest.fixture
    def docx_with_images(self):
        """First candidate sample whose ``resource_count`` is positive.

        Skips the requesting test when no candidate exists or none has resources.
        """
        for candidate_name in ("file-sample_1MB.docx", "sample-docx-file.docx"):
            candidate = TEST_FILES_DIR / candidate_name
            if not candidate.exists():
                continue
            parsed = parse_file(candidate)
            if parsed.resource_count > 0:
                return parsed
        pytest.skip("No test file with resources available")

    def test_get_resource_ids(self, docx_with_images):
        """``get_resource_ids()`` returns a non-empty list."""
        resource_ids = docx_with_images.get_resource_ids()
        assert isinstance(resource_ids, list)
        assert len(resource_ids) > 0

    def test_get_resource_info(self, docx_with_images):
        """Info for the first resource id exists and has a ``filename`` key."""
        resource_ids = docx_with_images.get_resource_ids()
        if not resource_ids:
            return
        details = docx_with_images.get_resource_info(resource_ids[0])
        assert details is not None
        assert "filename" in details

    def test_get_resource_data(self, docx_with_images):
        """Data for the first resource id exists and is non-empty."""
        resource_ids = docx_with_images.get_resource_ids()
        if not resource_ids:
            return
        payload = docx_with_images.get_resource_data(resource_ids[0])
        assert payload is not None
        assert len(payload) > 0

    def test_get_nonexistent_resource(self, docx_with_images):
        """Both lookups return ``None`` for an id that is not in the document."""
        missing_id = "nonexistent_id"
        assert docx_with_images.get_resource_info(missing_id) is None
        assert docx_with_images.get_resource_data(missing_id) is None


class TestFfiOwnershipAndUtf8:
    """String ownership and UTF-8 handling across the native boundary."""

    @staticmethod
    def _install_fake_library(monkeypatch):
        """Make ``undoc_module.get_library`` return a new fake; return that fake."""
        fake = FakeStringLibrary()
        monkeypatch.setattr(undoc_module, "get_library", lambda: fake)
        return fake

    def test_rust_owned_strings_are_copied_and_freed(self, monkeypatch):
        """``to_markdown`` returns the text and hands its pointer to the free hook."""
        fake = self._install_fake_library(monkeypatch)

        document = undoc_module.Undoc(123)
        rendered = document.to_markdown()
        # The most recently allocated buffer is the one the call returned.
        returned_address = ctypes.addressof(fake._buffers[-1])

        assert rendered == "Привет из Markdown"
        assert fake.freed == [returned_address]

    def test_last_error_uses_utf8_without_free(self, monkeypatch):
        """``_get_last_error`` returns the non-ASCII message and frees nothing."""
        fake = self._install_fake_library(monkeypatch)

        assert undoc_module._get_last_error() == "Ошибка native"
        assert fake.freed == []

    def test_version_uses_utf8_without_free(self, monkeypatch):
        """``version`` returns the canned string and frees nothing."""
        fake = self._install_fake_library(monkeypatch)

        assert undoc_module.version() == "1.2.3"
        assert fake.freed == []

    def test_metadata_and_resource_json_are_copied_before_free(self, monkeypatch):
        """Metadata and resource calls return decoded values and free every buffer."""
        fake = self._install_fake_library(monkeypatch)

        document = undoc_module.Undoc(123)
        # Call order matters: the freed list is compared with allocation order.
        observed_title = document.title
        observed_author = document.author
        observed_ids = document.get_resource_ids()
        observed_info = document.get_resource_info("rId1")

        allocated_addresses = [ctypes.addressof(held) for held in fake._buffers]

        assert observed_title == "Заголовок"
        assert observed_author == "Автор"
        assert observed_ids == ["rId1"]
        assert observed_info == {"filename": "Пример.png"}
        assert fake.freed == allocated_addresses

    def test_parse_bytes_generated_docx_preserves_unicode(self):
        """Cyrillic text from the generated DOCX appears in ``to_text()`` output."""
        parsed = parse_bytes(create_minimal_docx_bytes())
        extracted = parsed.to_text()

        assert "Привет из Python" in extracted

    def test_parse_file_generated_docx_preserves_unicode(self, tmp_path):
        """Cyrillic text from a generated DOCX on disk appears in the Markdown."""
        docx_path = tmp_path / "unicode.docx"
        docx_path.write_bytes(create_minimal_docx_bytes("Привет из файла"))

        with parse_file(docx_path) as doc:
            rendered = doc.to_markdown()

        assert "Привет из файла" in rendered