diff --git a/LICENSE b/LICENSE index c00b7d0..b96fa18 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2016, PDFTables.com +Copyright (c) 2026, PDFTables.com All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 8dfd223..24f869a 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,26 @@ To convert to CSV, XML or HTML simply change `c.xlsx` to be `c.csv`, `c.xml` or To specify Excel (single sheet) or Excel (multiple sheets) use `c.xlsx_single` or `c.xlsx_multiple`. +## Extractor + +You can specify which extraction engine to use when creating a `Client`. The available extractors are `standard` (default), `ai-1`, and `ai-2`. + +For AI extractors (`ai-1` and `ai-2`), you can also specify an `extract` option to control what content is extracted: `tables` (default) or `tables-paragraphs`. The `standard` extractor does not accept an `extract` option; passing one raises a `ValueError`. + +```py +from pdftables_api import (Client, EXTRACTOR_AI_1, EXTRACTOR_AI_2, + EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS) + +# Standard extractor (default) +c_standard = Client('my-api-key') + +# AI extractors for complex documents +c_ai_1 = Client('my-api-key', extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES) +c_ai_2 = Client('my-api-key', extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS) +``` + +See the [PDFTables API documentation](https://pdftables.com/pdf-to-excel-api#extractors) for details. + ## Test Tests run with pytest: `make test` diff --git a/pdftables_api/__init__.py b/pdftables_api/__init__.py index 8c82e3f..9a74d80 100644 --- a/pdftables_api/__init__.py +++ b/pdftables_api/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2016 The Sensible Code Company +# Copyright 2026 Cantabular Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,11 @@ # limitations under the License. 
from .pdftables_api import ( + EXTRACT_TABLES, + EXTRACT_TABLES_PARAGRAPHS, + EXTRACTOR_AI_1, + EXTRACTOR_AI_2, + EXTRACTOR_STANDARD, FORMAT_CSV, FORMAT_XLSX, FORMAT_XLSX_MULTIPLE, @@ -30,4 +35,9 @@ "FORMAT_XML", "APIException", "Client", + "EXTRACTOR_STANDARD", + "EXTRACTOR_AI_1", + "EXTRACTOR_AI_2", + "EXTRACT_TABLES", + "EXTRACT_TABLES_PARAGRAPHS", ] diff --git a/pdftables_api/pdftables_api.py b/pdftables_api/pdftables_api.py index 8ec8c09..91e79eb 100644 --- a/pdftables_api/pdftables_api.py +++ b/pdftables_api/pdftables_api.py @@ -1,4 +1,4 @@ -# Copyright 2016 The Sensible Code Company +# Copyright 2026 Cantabular Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -42,13 +42,71 @@ } _STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML} +EXTRACTOR_STANDARD = "standard" +EXTRACTOR_AI_1 = "ai-1" +EXTRACTOR_AI_2 = "ai-2" + +EXTRACT_TABLES = "tables" +EXTRACT_TABLES_PARAGRAPHS = "tables-paragraphs" + +# Valid `extract` values for each extractor type +_VALID_EXTRACTOR_VALUES = { + EXTRACTOR_STANDARD: (), # Standard extractor accepts no extract value + EXTRACTOR_AI_1: ( + EXTRACT_TABLES, + EXTRACT_TABLES_PARAGRAPHS, + ), # Use a tuple for consistent order in error messages + EXTRACTOR_AI_2: ( + EXTRACT_TABLES, + EXTRACT_TABLES_PARAGRAPHS, + ), +} + +# Valid extractor types +_VALID_EXTRACTORS = tuple(_VALID_EXTRACTOR_VALUES.keys()) + class Client: - def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT): + def __init__( + self, + api_key, + api_url=_API_URL, + timeout=_DEFAULT_TIMEOUT, + extractor=EXTRACTOR_STANDARD, + extract=None, + ): self.api_key = api_key self.api_url = api_url self.timeout = timeout + # Validate and set extractor configuration + self._validate_extractor(extractor, extract) + self.extractor = extractor + self.extract = extract + + @staticmethod + def _validate_extractor(extractor, extract): + """Validate extractor and extract parameters.""" + if 
extractor not in _VALID_EXTRACTORS: + valid_extractors = ", ".join(_VALID_EXTRACTORS) + raise ValueError( + f'Invalid extractor "{extractor}". Valid options are: {valid_extractors}' + ) + + valid_extract_values = _VALID_EXTRACTOR_VALUES[extractor] + if extract is not None and extract not in valid_extract_values: + if len(valid_extract_values) == 0: + raise ValueError( + f'Extractor "{extractor}" does not support extract parameter' + ) + else: + valid_extract_values_str = ", ".join( + str(opt) for opt in valid_extract_values + ) + raise ValueError( + f'Invalid extract value "{extract}" for extractor "{extractor}". Valid values are: {valid_extract_values_str}' + ) + def xlsx(self, pdf_path, xlsx_path=None): """ Convenience method to convert PDF to XLSX multiple sheets. @@ -147,7 +205,14 @@ def request(self, pdf_fo, out_format=None, query_params=None, **requests_params) url = self.api_url files = {"f": ("file.pdf", pdf_fo)} params = query_params if query_params else {} - params.update({"key": self.api_key, "format": out_format}) + params.update( + { + "key": self.api_key, + "format": out_format, + "extractor": self.extractor, + "extract": self.extract, + } + ) response = requests.post( url, files=files, stream=True, params=params, **requests_params diff --git a/pyproject.toml b/pyproject.toml index e612615..0c6a7af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,8 @@ description = "PDFTables.com Python API library." 
readme = "README.md" license = { text = "Apache License 2.0" } keywords = ["pdf", "tables", "excel", "csv", "xml", "api"] -authors = [ { name = "The Sensible Code Company", email = "support@sensiblecode.io" } ] -urls = { "Homepage" = "https://github.com/sensiblecode/python-pdftables-api" } +authors = [ { name = "Cantabular Ltd", email = "hello@pdftables.com" } ] +urls = { "Homepage" = "https://github.com/pdftables/python-pdftables-api" } dependencies = ["requests"] classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/test/test_pdftables_api.py b/test/test_pdftables_api.py index e3c10e0..8e51beb 100644 --- a/test/test_pdftables_api.py +++ b/test/test_pdftables_api.py @@ -1,4 +1,4 @@ -# Copyright 2016 The Sensible Code Company +# Copyright 2026 Cantabular Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,7 +20,15 @@ import pytest import requests_mock -from pdftables_api import APIException, Client +from pdftables_api import ( + EXTRACT_TABLES, + EXTRACT_TABLES_PARAGRAPHS, + EXTRACTOR_AI_1, + EXTRACTOR_AI_2, + EXTRACTOR_STANDARD, + APIException, + Client, +) class TestEnsureExtFormat(TestCase): @@ -181,6 +189,157 @@ def test_response_unknown_file_format(self): c.dump(png_fo) +class TestExtractorParameters(TestCase): + def test_default_extractor(self): + """Test that default extractor is 'standard' with no extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=standard", + text="xlsx output", + ) + + c = Client("fake_key") + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name) + + def test_ai1_extractor_with_no_extract(self): + """Test ai-1 extractor with no extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1", + 
text="xlsx output", + ) + + c = Client("fake_key", extractor=EXTRACTOR_AI_1) + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name) + + def test_ai1_extractor_with_tables(self): + """Test ai-1 extractor with 'tables' extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1&extract=tables", + text="xlsx output", + ) + + c = Client("fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES) + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name) + + def test_ai1_extractor_with_tables_paragraphs(self): + """Test ai-1 extractor with 'tables-paragraphs' extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-1&extract=tables-paragraphs", + text="csv output", + ) + + c = Client( + "fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES_PARAGRAPHS + ) + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name, out_format="csv") + + def test_ai2_extractor_with_no_extract(self): + """Test ai-2 extractor with no extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2", + text="xlsx output", + ) + + c = Client("fake_key", extractor=EXTRACTOR_AI_2) + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name) + + def test_ai2_extractor_with_tables(self): + """Test ai-2 extractor with 'tables' extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2&extract=tables", + text="xlsx output", + ) + + c = Client("fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES) + with 
NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name) + + def test_ai2_extractor_with_tables_paragraphs(self): + """Test ai-2 extractor with 'tables-paragraphs' extract parameter.""" + with requests_mock.mock() as m: + m.post( + "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-2&extract=tables-paragraphs", + text="csv output", + ) + + c = Client( + "fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS + ) + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name, out_format="csv") + + def test_standard_extractor_no_extract_param_in_url(self): + """Test that standard extractor doesn't include extract parameter in URL.""" + with requests_mock.mock() as m: + # Note: no 'extract' parameter in the URL for standard extractor + m.post( + "https://pdftables.com/api?key=fake_key&format=csv&extractor=standard", + text="csv output", + ) + + c = Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=None) + with NamedTemporaryFile(suffix="test.pdf") as tf: + tf.write(b"Hello world") + tf.file.close() + c.convert(tf.name, out_format="csv") + + def test_invalid_extractor_raises_error(self): + """Test that invalid extractor raises ValueError.""" + with pytest.raises( + ValueError, + match='^Invalid extractor "invalid". 
Valid options are: standard, ai-1, ai-2$', + ): + Client("fake_key", extractor="invalid") + + def test_invalid_extract_for_standard_raises_error(self): + """Test that providing an extract parameter for the standard extractor raises ValueError.""" + with pytest.raises( + ValueError, + match='^Extractor "standard" does not support extract parameter$', + ): + Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=EXTRACT_TABLES) + + def test_invalid_extract_for_ai_raises_error(self): + """Test that invalid extract value for AI-1 extractor raises ValueError.""" + with pytest.raises( + ValueError, + match='^Invalid extract value "invalid" for extractor "ai-1". Valid values are: tables, tables-paragraphs$', + ): + Client("fake_key", extractor=EXTRACTOR_AI_1, extract="invalid") + + def test_invalid_extract_for_ai2_raises_error(self): + """Test that invalid extract value for AI-2 extractor raises ValueError.""" + with pytest.raises( + ValueError, + match='^Invalid extract value "invalid" for extractor "ai-2". Valid values are: tables, tables-paragraphs$', + ): + Client("fake_key", extractor=EXTRACTOR_AI_2, extract="invalid") + + def consume(s): r = b"" for chunk in s: