diff --git a/.atoum.php b/.atoum.php deleted file mode 100644 index c4fce602..00000000 --- a/.atoum.php +++ /dev/null @@ -1,60 +0,0 @@ -addDefaultReport(); - -/* -LOGO - -// This will add the atoum logo before each run. -$report->addField(new atoum\report\fields\runner\atoum\logo()); - -// This will add a green or red logo after each run depending on its status. -$report->addField(new atoum\report\fields\runner\result\logo()); -*/ - -/* -CODE COVERAGE SETUP -*/ -// Please replace in next line "Project Name" by your project name and "/path/to/destination/directory" by your destination directory path for html files. -$coverageField = new atoum\report\fields\runner\coverage\html('PdfParser', 'coverage'); - -// Please replace in next line http://url/of/web/site by the root url of your code coverage web site. -$coverageField->setRootUrl('http://test.local'); - -$report->addField($coverageField); -/**/ - -/* -TEST GENERATOR SETUP - -$testGenerator = new atoum\test\generator(); - -// Please replace in next line "/path/to/your/tests/units/classes/directory" by your unit test's directory. -$testGenerator->setTestClassesDirectory('path/to/your/tests/units/classes/directory'); - -// Please replace in next line "your\project\namespace\tests\units" by your unit test's namespace. -$testGenerator->setTestClassNamespace('your\project\namespace\tests\units'); - -// Please replace in next line "/path/to/your/classes/directory" by your classes directory. -$testGenerator->setTestedClassesDirectory('path/to/your/classes/directory'); - -// Please replace in next line "your\project\namespace" by your project namespace. -$testGenerator->setTestedClassNamespace('your\project\namespace'); - -// Please replace in next line "path/to/your/tests/units/runner.php" by path to your unit test's runner. 
-$testGenerator->setRunnerPath('path/to/your/tests/units/runner.php'); - -$script->getRunner()->setTestGenerator($testGenerator); -*/ diff --git a/.bootstrap.atoum.php b/.bootstrap.atoum.php deleted file mode 100644 index 174bd261..00000000 --- a/.bootstrap.atoum.php +++ /dev/null @@ -1,17 +0,0 @@ - + + + - PHP Version: + - PDFParser Version: + +### Description: + +### PDF input + + +### Expected output & actual output + + +### Code + diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..6e462ec6 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,14 @@ +# Type of pull request + +* [ ] Bug fix (involves code and configuration changes) +* [ ] New feature (involves code and configuration changes) +* [ ] Documentation update +* [ ] Something else + +# About + + + +# Checklist for code / configuration changes + +See [CONTRIBUTING.md](./../CONTRIBUTING.md) for all essential information about contributing. diff --git a/.github/workflows/coding-standards.yml b/.github/workflows/coding-standards.yml new file mode 100644 index 00000000..89f1273b --- /dev/null +++ b/.github/workflows/coding-standards.yml @@ -0,0 +1,37 @@ +name: "CS" + +on: + pull_request: + push: + branches: + - master + +jobs: + coding-standards: + name: "CS Fixer & PHPStan" + runs-on: ubuntu-latest + steps: + - + name: Checkout + uses: actions/checkout@v3 + - + env: + COMPOSER_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + name: "Install PHP" + uses: shivammathur/setup-php@v2 + with: + coverage: none + php-version: "7.4" + tools: "composer:v2" + - + name: Install dependencies with Composer + uses: ramsey/composer-install@v2 + - + name: Install dev tools + run: make install-dev-tools + - + name: Run PHP CS Fixer + run: 'make run-php-cs-fixer ARGS="--verbose --dry-run"' + - + name: Run PHPStan + run: 'make run-phpstan ARGS="--no-progress"' diff --git a/.github/workflows/continuous-integration.yml 
b/.github/workflows/continuous-integration.yml new file mode 100644 index 00000000..daadf93f --- /dev/null +++ b/.github/workflows/continuous-integration.yml @@ -0,0 +1,204 @@ +name: "CI" + +on: [push, pull_request] + +jobs: + phpunit: + name: "PHPUnit (PHP ${{ matrix.php }})" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.1', '7.2', '7.3', '7.4', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5'] + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Install PHP" + uses: "shivammathur/setup-php@v2" + with: + php-version: "${{ matrix.php }}" + coverage: "none" + tools: composer:v2 + env: + COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: "Install dependencies with Composer" + uses: "ramsey/composer-install@v2" + + - name: "Install dev tools" + run: "make install-dev-tools" + + - name: "Run PHPUnit" + run: "make run-phpunit" + + phpunit-lower-php: + name: "PHPUnit (PHP ${{ matrix.php }})" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.1'] + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Install PHP" + uses: "shivammathur/setup-php@v2" + with: + php-version: "${{ matrix.php }}" + coverage: "none" + tools: composer:v2 + env: + COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: "Install dependencies with Composer" + uses: "ramsey/composer-install@v2" + + - name: "Install dev tools" + run: "make install-dev-tools" + + - name: "Run PHPUnit" + run: "make run-phpunit" + + phpunit-coverage: + name: "PHPUnit coverage (PHP ${{ matrix.php }})" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.4'] + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Install PHP with PCOV" + uses: "shivammathur/setup-php@v2" + with: + php-version: "${{ matrix.php }}" + coverage: "xdebug" + tools: composer:v2 + env: + COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: "Install dependencies with Composer" + uses: "ramsey/composer-install@v2" + + - name: "Install dev tools" + run: "make 
install-dev-tools" + + - name: "Run PHPUnit" + run: make run-phpunit ARGS="-v --coverage-clover coverage/clover.xml" + + phpunit-composerv2: + name: "PHPUnit Composer v2 (PHP ${{ matrix.php }})" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.4'] + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Install PHP" + uses: "shivammathur/setup-php@v2" + with: + php-version: "${{ matrix.php }}" + coverage: "none" + tools: composer:v2 + env: + COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: "Install dependencies with Composer" + uses: "ramsey/composer-install@v2" + + - name: "Install dev tools" + run: "make install-dev-tools" + + - name: "Run PHPUnit" + run: "make run-phpunit" + + alt-autoload: + name: "Tests alternative autoloader (PHP ${{ matrix.php }})" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.1', '7.2', '7.3', '7.4', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5'] + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Install PHP" + uses: "shivammathur/setup-php@v2" + with: + php-version: "${{ matrix.php }}" + coverage: "none" + + - name: "Test alt-autoload" + run: "php tests/AltAutoloading/AltAutoloadCheck.php" + + phpunit-lowest: + name: "PHPUnit lowest deps (PHP ${{ matrix.php }})" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.3'] + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Install PHP" + uses: "shivammathur/setup-php@v2" + with: + php-version: "${{ matrix.php }}" + coverage: "none" + tools: composer:v2 + env: + COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: "Install dependencies with Composer" + uses: "ramsey/composer-install@v2" + with: + dependency-versions: "lowest" + + - name: "Install dev tools" + run: "make install-dev-tools" + + - name: "Run PHPUnit" + run: "make run-phpunit" + + windows-tests: + name: Windows-Tests with PHP ${{ matrix.php }} + runs-on: windows-latest + + strategy: + fail-fast: false + matrix: + php: ['7.1', '7.2', 
'7.3', '7.4', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5'] + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install PHP + uses: shivammathur/setup-php@v2 + with: + php-version: ${{ matrix.php }} + ini-values: memory_limit=1G + + - name: Install Composer dependencies (root) + run: composer update --no-progress --no-suggest --prefer-dist --optimize-autoloader + + - name: Install Composer dependencies (dev-tools) + run: composer update --working-dir=dev-tools + + - name: Tests + run: dev-tools/vendor/bin/phpunit -c phpunit-windows.xml --exclude-group linux-only diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml new file mode 100644 index 00000000..eb0c74aa --- /dev/null +++ b/.github/workflows/performance.yml @@ -0,0 +1,29 @@ +name: "Performance Tests" + +on: + pull_request: + push: + branches: + - "master" + +env: + fail-fast: true + +jobs: + performance-tests: + name: "Tests for the performance testing the PDF parsing" + runs-on: ubuntu-latest + + strategy: + matrix: + php: ['7.4'] + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "Run composer for further autoloading" + run: "composer update" + + - name: "Run performance tests" + run: "php tests/Performance/runPerformanceTests.php" diff --git a/.gitignore b/.gitignore index ef332fbe..841b7199 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,12 @@ /.idea/* /coverage/* -/samples/* /vendor/* -/test* /xdebug/* /composer.phar +/dev-tools/vendor /composer debug* composer.lock +/.php-cs-fixer.cache +/.phpunit.cache +/.phpunit.result.cache diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php new file mode 100644 index 00000000..683637a2 --- /dev/null +++ b/.php-cs-fixer.php @@ -0,0 +1,31 @@ +in([ + __DIR__.'/src', + __DIR__.'/tests', + ]) + ->name('*.php') +; + +$config = new Config(); +$config + ->setFinder($finder) + ->setRules([ + '@PSR12' => true, + 'array_syntax' => ['syntax' => 'short'], + 'no_empty_phpdoc' => true, + 'no_unused_imports' => 
true, + 'no_superfluous_phpdoc_tags' => true, + 'ordered_imports' => true, + 'phpdoc_summary' => false, + 'protected_to_private' => false, + 'get_class_to_class_keyword' => false, // override for PHP < 8.0 (because ::class usage is not allowed there) + 'modernize_strpos' => false, // override for PHP < 8.0 (because str_contains not available in PHP 7.x) + ]) +; + +return $config; diff --git a/.scrutinizer.yml b/.scrutinizer.yml new file mode 100644 index 00000000..60f07edf --- /dev/null +++ b/.scrutinizer.yml @@ -0,0 +1,21 @@ +build: + cache: + directories: + - vendor # Cache for already installed composer package -> speed up composer install + nodes: + analysis: + environment: + php: + version: 8.2 + ini: + memory_limit: "-1" + variables: + XDEBUG_MODE: 'coverage' + tests: + override: + - php-scrutinizer-run + - + command: make install-dev-tools && make run-phpunit ARGS="--migrate-configuration" && make run-phpunit ARGS="--exclude-group memory-heavy --coverage-clover coverage/clover.xml" + coverage: + file: coverage/clover.xml + format: clover diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index a4621952..00000000 --- a/.travis.yml +++ /dev/null @@ -1,8 +0,0 @@ -language: php -php: - - 5.3 - - 5.4 - - 5.5 -before_script: - - composer update -script: ./vendor/bin/atoum -d src/Smalot/PdfParser/Tests/ -ncc diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..92e25243 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,19 @@ +# Contributing + +**Please read the following text before creating a pull request.** + +This project is organized and supported by contributions from the community. Maintenance is done in our limited time. +We welcome any pull request that contributes to PDFParser (code, documentation, ...). +However, we would like to point out that you are initially responsible for a contribution. 
+If you are new to dealing with pull requests, you can find more information in the [GitHub documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests). +Please don't just throw code at us and expect us to handle it. +Nevertheless, we will support you and give you feedback. + +To make life easier for you and us, there is a Continuous Integration (CI) system that carries out software tests and performs a number of other tasks. +The following points describe the relevant preparations/inputs for the CI system. +All checks must be green; otherwise, a pull request will not be accepted. +* Please create an [issue](https://github.com/smalot/pdfparser/issues) before starting work on any significant changes. +* We only accept code that is bundled with tests, regardless of whether it is a new function or a bug fix. This strengthens the code base and avoids later regressions. :exclamation: **If you don't know how to write a test, tell us upfront when you open the pull request and we might add them ourselves or discuss other ways**. This [Medium article](https://pguso.medium.com/a-beginners-guide-to-phpunit-writing-and-running-unit-tests-in-php-d0b23b96749f) might be a good starting point. Code changes without tests are very likely to be rejected. +* Fix reported issues with the coding style. We use **PHP-CS-Fixer** for this. See [.php-cs-fixer.php](./.php-cs-fixer.php) for more information about our coding styles. [Developer.md](./doc/Developer.md) contains more information about this topic. +* If you are fixing an **existing error**, refer to it in the introduction text of the pull request. For example, if you created a fix for issue `#1234`, write the following Markdown: `fixes #1234`. 
+* In case you have changed **internal behavior/functionality**, check our documentation to make sure these changes are **correctly documented**: https://github.com/smalot/pdfparser/tree/master/doc diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 94a9ed02..00000000 --- a/LICENSE +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. 
- - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. 
- - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. 
If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. 
For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. 
This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. 
- - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. 
- - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. 
If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. 
If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. 
- - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. 
For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. 
diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..65c5ca88 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. 
+ + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..61406039 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +install-dev-tools: + composer update --working-dir=dev-tools + +run-php-cs-fixer: + dev-tools/vendor/bin/php-cs-fixer fix $(ARGS) + +run-phpstan: + dev-tools/vendor/bin/phpstan analyze $(ARGS) + +run-phpunit: + dev-tools/vendor/bin/phpunit $(ARGS) diff --git a/README.md b/README.md index 294451f4..febcf303 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,59 @@ -# PdfParser # +# PDF parser -Pdf Parser, a standalone PHP library, provides various tools to extract data from a PDF file. +[![Version](https://poser.pugx.org/smalot/pdfparser/v)](//packagist.org/packages/smalot/pdfparser) +![CI](https://github.com/smalot/pdfparser/workflows/CI/badge.svg) +![CS](https://github.com/smalot/pdfparser/workflows/CS/badge.svg) +[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/smalot/pdfparser/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/smalot/pdfparser/?branch=master) +[![Downloads](https://poser.pugx.org/smalot/pdfparser/downloads)](//packagist.org/packages/smalot/pdfparser) -[![Build Status](https://travis-ci.org/smalot/pdfparser.png?branch=master)](https://travis-ci.org/smalot/pdfparser) -[![Total Downloads](https://poser.pugx.org/smalot/pdfparser/downloads.png)](https://packagist.org/packages/smalot/pdfparser) -[![Current Version](https://poser.pugx.org/smalot/pdfparser/v/stable.png)](https://packagist.org/packages/smalot/pdfparser) +The `smalot/pdfparser` is a standalone PHP package that provides various tools to extract data from PDF files. 
-Website : [http://www.pdfparser.org](http://www.pdfparser.org/?utm_source=GitHub&utm_medium=website&utm_campaign=GitHub) +This library is under **active maintenance**. +There is no active development by the author of this library (at the moment), but we welcome any pull request adding/extending functionality! +See [CONTRIBUTING.md](./CONTRIBUTING.md) for further information about how to contribute. -Test the API on our [demo page](http://www.pdfparser.org/demo). - -This project is supported by [Actualys](http://www.actualys.com). - -## Features ## - -Features included : +## Features - Load/parse objects and headers -- Extract meta data (author, description, ...) +- Extract metadata (author, description, ...) - Extract text from ordered pages -- Support of compressed pdf +- Support of compressed PDFs - Support of MAC OS Roman charset encoding - Handling of hexa and octal encoding in text sections -- PSR-0 compliant ([autoloader](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-0.md)) -- PSR-1 compliant ([code styling](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-1-basic-coding-standard.md)) +- Create custom configurations (see [CustomConfig.md](/doc/CustomConfig.md)). + +Currently, secured documents and extracting form data are not supported. + +## License + +This library is under the [LGPLv3 license](https://github.com/smalot/pdfparser/blob/master/LICENSE.txt). + +## Install + +This library requires PHP 7.1+ since [v1](https://github.com/smalot/pdfparser/releases/tag/v1.0.0). +You can install it via [Composer](https://getcomposer.org/): + +```bash +composer require smalot/pdfparser +``` + +In case you can't use Composer, you can include `alt_autoload.php-dist`. It will include all required files automatically. -Currently, secured documents are not supported. +## Quick example -This Library is still under active development. -As a result, users must expect BC breaks when using the master version. 
+```php +$parser = new \Smalot\PdfParser\Parser(); +$pdf = $parser->parseFile('/path/to/document.pdf'); -[Read the documentation on website](http://www.pdfparser.org/documentation?utm_source=GitHub&utm_medium=documentation&utm_campaign=GitHub). +$text = $pdf->getText(); +echo $text; +``` -## License ## +Further usage information can be found [here](/doc/Usage.md). -This library is under the [GPLv3 license](https://github.com/smalot/pdfparser/blob/master/LICENSE). +## Documentation +Documentation can be found in the [doc](/doc) folder. diff --git a/alt_autoload.php-dist b/alt_autoload.php-dist new file mode 100644 index 00000000..366c57c6 --- /dev/null +++ b/alt_autoload.php-dist @@ -0,0 +1,75 @@ + + * @date 2021-02-09 + * + * @license LGPLv3 + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + * + * -------------------------------------------------------------------------------------- + * + * About: + * This file provides an alternative to the Composer-approach. + * Include it into your project and all required files of PDFParser will be loaded automatically. + * Please use it only, if Composer is not available. + * + * How to use: + * 1. include this file as it is OR copy and rename it as you like (and then include it) + * 2. afterwards you can use PDFParser classes + * Done.
+ */ + +/** + * Loads all files found in a given folder. + * Calls itself recursively for all subfolders. + * + * @param string $dir + */ +function requireFilesOfFolder($dir) +{ + foreach (new DirectoryIterator($dir) as $fileInfo) { + if (!$fileInfo->isDot()) { + if ($fileInfo->isDir()) { + requireFilesOfFolder($fileInfo->getPathname()); + } else { + require_once $fileInfo->getPathname(); + } + } + } +} + +$rootFolder = __DIR__.'/src/Smalot/PdfParser'; + +// Manually require files that can't be loaded automatically that easily. +require_once $rootFolder.'/Element.php'; +require_once $rootFolder.'/PDFObject.php'; +require_once $rootFolder.'/Font.php'; +require_once $rootFolder.'/Page.php'; +require_once $rootFolder.'/Element/ElementString.php'; +require_once $rootFolder.'/Encoding/AbstractEncoding.php'; + +/* + * Load the rest of the PdfParser files from /src/Smalot/PdfParser. + * Don't worry: require_once ensures that no file is loaded twice. + */ +requireFilesOfFolder($rootFolder); diff --git a/composer.json b/composer.json index b0775404..e9f425fd 100644 --- a/composer.json +++ b/composer.json @@ -1,31 +1,36 @@ { "name": "smalot/pdfparser", "description": "Pdf parser library.
Can read and extract information from pdf file.", - "license": "GPL-3.0", + "keywords": ["PDF", "text", "parser", "parse", "extract"], + "type": "library", + "license": "LGPL-3.0", "authors": [ { "name": "Sebastien MALOT", - "email": "sebastien@malot.fr", - "role": "Developer", - "homepage": "http://www.malot.fr" + "email": "sebastien@malot.fr" } ], "support": { "issues": "https://github.com/smalot/pdfparser/issues" }, - "homepage": "http://www.pdfparser.org", + "homepage": "https://www.pdfparser.org", "require": { - "php": ">=5.3.0", - "tecnick.com/tcpdf": ">=6.0.050" - }, - "require-dev": { - "atoum/atoum": "dev-master" + "php": ">=7.1", + "symfony/polyfill-mbstring": "^1.18", + "ext-zlib": "*", + "ext-iconv": "*" }, "autoload": { "psr-0": { "Smalot\\PdfParser\\": "src/" } }, + "autoload-dev": { + "psr-4": { + "PerformanceTests\\": "tests/Performance/", + "PHPUnitTests\\": "tests/PHPUnit/" + } + }, "config": { "process-timeout": 1200 } diff --git a/dev-tools/composer.json b/dev-tools/composer.json new file mode 100644 index 00000000..65d85927 --- /dev/null +++ b/dev-tools/composer.json @@ -0,0 +1,9 @@ +{ + "description": "This file provides development-only dependencies.", + "require-dev": { + "friendsofphp/php-cs-fixer": "^3", + "phpstan/phpstan": "^1", + "phpstan/phpstan-phpunit": "^1", + "phpunit/phpunit": ">=7.5 <11.0" + } +} diff --git a/doc/CustomConfig.md b/doc/CustomConfig.md new file mode 100644 index 00000000..d3f8ad57 --- /dev/null +++ b/doc/CustomConfig.md @@ -0,0 +1,80 @@ +# Configuring the behavior of the parser + +To change the behavior of the parser, create a `Config` object and pass it to the parser. +In this case, we're setting the font space limit. +Changing this value can be helpful when `getText()` returns a text with too many spaces. 
+ +```php +$config = new \Smalot\PdfParser\Config(); +$config->setFontSpaceLimit(-60); +$parser = new \Smalot\PdfParser\Parser([], $config); +$pdf = $parser->parseFile('document.pdf'); +// output extracted text +// echo $pdf->getText(); +``` + +## Config options overview + +The `Config` class has the following options: + +| Option | Type | Default | Description | +|--------------------------|---------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| `setDecodeMemoryLimit` | Integer | `0` | If parsing fails because of memory exhaustion, you can set a lower memory limit for decoding operations. | +| `setFontSpaceLimit` | Integer | `-50` | Changing font space limit can be helpful when `Parser::getText()` returns a text with too many spaces. | +| `setIgnoreEncryption` | Boolean | `false` | Read PDFs that are not encrypted but have the encryption flag set. This is a temporary workaround, don't rely on it. | +| `setHorizontalOffset` | String | ` ` | When words are broken up or when the structure of a table is not preserved, you may get better results when adapting `setHorizontalOffset`. | +| `setPdfWhitespaces` | String | `\0\t\n\f\r ` | | +| `setPdfWhitespacesRegex` | String | `[\0\t\n\f\r ]` | | +| `setRetainImageContent` | Boolean | `true` | If parsing fails due to memory exhaustion, you can set the value to `false`. This will reduce memory usage, although it will no longer retain image content. | + + +## option setDecodeMemoryLimit + setRetainImageContent (manage memory usage) + +If parsing fails because of memory exhaustion, you can use the following options. 
+ +```php +$config = new \Smalot\PdfParser\Config(); +// Whether to retain raw image data as content or discard it to save memory +$config->setRetainImageContent(false); +// Memory limit to use when de-compressing files, in bytes +$config->setDecodeMemoryLimit(1000000); +$parser = new \Smalot\PdfParser\Parser([], $config); +``` + +## option setHorizontalOffset + +When words are broken up or when the structure of a table is not preserved, you can use `setHorizontalOffset`. + +```php +$config = new \Smalot\PdfParser\Config(); +// An empty string can prevent words from breaking up +$config->setHorizontalOffset(''); +// A tab can help preserve the structure of your document +$config->setHorizontalOffset("\t"); +$parser = new \Smalot\PdfParser\Parser([], $config); +``` + +## option setFontSpaceLimit + +Changing font space limit can be helpful when `getText()` returns a text with too many spaces. + +```php +$config = new \Smalot\PdfParser\Config(); +$config->setFontSpaceLimit(-60); +$parser = new \Smalot\PdfParser\Parser([], $config); +$pdf = $parser->parseFile('document.pdf'); +``` + +## option setIgnoreEncryption + +In some cases PDF files may be internally marked as encrypted even though the content is not encrypted and can be read. +This can be caused by the PDF being created by a tool that does not properly set the encryption flag. +If you are sure that the PDF is not encrypted, you can ignore the encryption flag by setting the `ignoreEncryption` flag to `true` in a custom `Config` instance. + +```php +$config = new \Smalot\PdfParser\Config(); +$config->setIgnoreEncryption(true); + +$parser = new \Smalot\PdfParser\Parser([], $config); +$pdf = $parser->parseFile('document.pdf'); +``` diff --git a/doc/Developer.md b/doc/Developer.md new file mode 100644 index 00000000..e108a82e --- /dev/null +++ b/doc/Developer.md @@ -0,0 +1,57 @@ +# Developers + +Here you will find information about our development tools and how to use them. 
+ +## .editorconfig + +Please make sure your editor uses our `.editorconfig` file. It contains rules about our coding styles. + +## GitHub Action Workflows + +We use GitHub Actions to run our continuous integration as well as other tasks after pushing changes. +You will find related files in `.github/workflows/`. + +## Development Tools and Tests + +Our test-related files are located in the `tests` folder. +Tests are written using PHPUnit. + +To install (and update) development tools like PHPUnit or PHP-CS-Fixer, run: + +```bash +make install-dev-tools +``` + +Development tools are installed in `dev-tools/vendor`. +Please check `dev-tools/composer.json` for more information about versions etc. +To run a tool manually, use `dev-tools/vendor/bin`, for instance: + +```bash +dev-tools/vendor/bin/php-cs-fixer fix --verbose --dry-run +``` + +Below are a few shortcuts to improve your developer experience. + +### PHPUnit + +To run all tests, run: + +```bash +make run-phpunit +``` + +### PHP-CS-Fixer + +To check coding styles, run: + +```bash +make run-php-cs-fixer +``` + +### PHPStan + +To run a static code analysis, use: + +```bash +make run-phpstan +``` diff --git a/doc/Usage.md b/doc/Usage.md new file mode 100644 index 00000000..787c79fe --- /dev/null +++ b/doc/Usage.md @@ -0,0 +1,248 @@ +# Usage + +First create a parser object and point it to a file. + +```php +$parser = new \Smalot\PdfParser\Parser(); + +$pdf = $parser->parseFile('document.pdf'); +// .. or ... +$pdf = $parser->parseContent(file_get_contents('document.pdf')); +``` + +## Extract text + +A common scenario is to extract text. + +```php +// extract text of the whole PDF +$text = $pdf->getText(); + +// or extract the text of a specific page (in this case the first page) +$text = $pdf->getPages()[0]->getText(); + +// you can also extract the text of a limited number of pages. Here, only the first five pages are used.
+$text = $pdf->getText(5); +``` + +## Extract text positions + +You can extract transformation matrix (indexes 0-3) and x,y position of text objects (indexes 4,5). + +```php +$data = $pdf->getPages()[0]->getDataTm(); + +Array +( + [0] => Array + ( + [0] => Array + ( + [0] => 0.999429 + [1] => 0 + [2] => 0 + [3] => 1 + [4] => 201.96 + [5] => 720.68 + ) + + [1] => Document title + ) + + [1] => Array + ( + [0] => Array + ( + [0] => 0.999402 + [1] => 0 + [2] => 0 + [3] => 1 + [4] => 70.8 + [5] => 673.64 + ) + + [1] => Calibri : Lorem ipsum dolor sit amet, consectetur a + ) +) +``` + +When activated via Config setting (`Config::setDataTmFontInfoHasToBeIncluded(true)`) font identifier (index 2) and font size (index 3) are added to dataTm. + +```php +// create config +$config = new Smalot\PdfParser\Config(); +$config->setDataTmFontInfoHasToBeIncluded(true); + +// use config and parse file +$parser = new Smalot\PdfParser\Parser([], $config); +$pdf = $parser->parseFile('document.pdf'); +$firstpage = $pdf->getPages()[0]; +$data = $firstpage->getDataTm(); + +Array +( + [0] => Array + ( + [0] => Array + ( + [0] => 0.999429 + [1] => 0 + [2] => 0 + [3] => 1 + [4] => 201.96 + [5] => 720.68 + ) + + [1] => Document title + [2] => R7 + [3] => 27.96 + ) + + [1] => Array + ( + [0] => Array + ( + [0] => 0.999402 + [1] => 0 + [2] => 0 + [3] => 1 + [4] => 70.8 + [5] => 673.64 + ) + + [1] => Calibri : Lorem ipsum dolor sit amet, consectetur a + [2] => R9 + [3] => 11.04 + ) +) +``` + +Text width should be calculated on text from dataTm to make sure all character widths are available. +In next example we are using data from above. + +```php +$font_id = $data[0][2]; //R7 +$font = $firstpage->getFont($font_id); +$text = $data[0][1]; +$width = $font->calculateTextWidth($text, $missing); +``` + +## Extract metadata + +You can also extract metadata. The available data varies from PDF to PDF. 
+ +```php +$metaData = $pdf->getDetails(); + +Array +( + [Producer] => Adobe Acrobat + [CreatedOn] => 2022-01-28T16:36:11+00:00 + [Pages] => 35 + ... +) +``` + +If the PDF contains Extensible Metadata Platform (XMP) XML metadata, its values, including the XMP namespace, will be appended to the data returned by `getDetails()`. You can read more about what values and namespaces are commonly used in the [XMP Specifications](https://github.com/adobe/XMP-Toolkit-SDK/tree/main/docs). + +```php +Array +( + ... + [Pages] => 35 + [dc:creator] => My Name + [pdf:producer] => Adobe Acrobat + [dc:title] => My Document Title + ... +) +``` + +Some XMP metadata properties may have multiple values, or even named children with their own values. In these cases, the value will be an array. The XMP metadata will follow the structure of the XML so it is possible to have multiple levels of nested values. + +```php +Array +( + ... + [dc:title] => My Document Title + [xmptpg:maxpagesize] => Array + ( + [stdim:w] => 21.500000 + [stdim:h] => 6.222222 + [stdim:unit] => Inches + ) + [xmptpg:platenames] => Array + ( + [0] => Cyan + [1] => Magenta + [2] => Yellow + [3] => Black + ) + ... +) +``` + + +## Read Base64 encoded PDFs + +If working with [Base64](https://en.wikipedia.org/wiki/Base64) encoded PDFs, you might want to parse the PDF without saving the file to disk. +This sample will parse the Base64 encoded PDF and extract its text. + +```php +$parser = new \Smalot\PdfParser\Parser(); +$pdf = $parser->parseContent(base64_decode($base64PDF)); + +$text = $pdf->getText(); +echo $text; +``` + +## Calculate text width + +Calculate the width of a text for a given font. +Characters without a known width are added to the `$missing` array passed as the second parameter.
+ +```php +$parser = new \Smalot\PdfParser\Parser(); +$pdf = $parser->parseFile('document.pdf'); +$fonts = $pdf->getFonts(); +// get first font (we assume here there is at least one) +$font = reset($fonts); +// get width +$width = $font->calculateTextWidth('Some text', $missing); +``` + +## Get pages width and height + +Ref: [#472](https://github.com/smalot/pdfparser/issues/427#issuecomment-973416786) + +```php +$parser = new \Smalot\PdfParser\Parser(); +$pdf = $parser->parseFile('document.pdf'); +$pages = $pdf->getPages(); +// this variable will contain the height and width of each page of the given PDF +$mediaBox = []; +foreach ($pages as $page) { + $details = $page->getDetails(); + // If Mediabox is not set in details of current $page instance, get details from the header instead + if (!isset($details['MediaBox'])) { + $pages = $pdf->getObjectsByType('Pages'); + $details = reset($pages)->getHeader()->getDetails(); + } + $mediaBox[] = [ + 'width' => $details['MediaBox'][2], + 'height' => $details['MediaBox'][3] + ]; +} +``` + +## PDF encryption + +This library cannot currently read encrypted PDF files, i.e. those with +a read password. Attempting to do so produces this error: +``` +Exception: Secured pdf file are currently not supported. +``` + +See `setIgnoreEncryption` option in [CustomConfig.md](CustomConfig.md) +for how to override the check in specific cases. 
diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 3378b2ba..00000000 --- a/docs/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Documentation # - -PDF References files has been downloaded from this url : http://www.adobe.com/devnet/pdf/pdf_reference_archive.html diff --git a/phpstan.neon b/phpstan.neon new file mode 100644 index 00000000..8dd084ae --- /dev/null +++ b/phpstan.neon @@ -0,0 +1,12 @@ +includes: + - dev-tools/vendor/phpstan/phpstan-phpunit/extension.neon + - dev-tools/vendor/phpstan/phpstan-phpunit/rules.neon + +parameters: + level: 3 + paths: + - src + - tests + + bootstrapFiles: + - vendor/autoload.php diff --git a/phpunit-windows.xml b/phpunit-windows.xml new file mode 100644 index 00000000..ca794e98 --- /dev/null +++ b/phpunit-windows.xml @@ -0,0 +1,21 @@ + + + + + + src + + + + + + + + + + + + tests/PHPUnit + + + diff --git a/phpunit.xml b/phpunit.xml new file mode 100644 index 00000000..52673fe8 --- /dev/null +++ b/phpunit.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + tests/PHPUnit + + + + + src + + + diff --git a/samples/Document-Word-Landscape-printedaspdf.pdf b/samples/Document-Word-Landscape-printedaspdf.pdf new file mode 100644 index 00000000..a1dd549a Binary files /dev/null and b/samples/Document-Word-Landscape-printedaspdf.pdf differ diff --git a/samples/DocumentWithLotsOfObjects.pdf b/samples/DocumentWithLotsOfObjects.pdf new file mode 100644 index 00000000..2c36e912 Binary files /dev/null and b/samples/DocumentWithLotsOfObjects.pdf differ diff --git a/samples/ImproperFontFallback.pdf b/samples/ImproperFontFallback.pdf new file mode 100644 index 00000000..2f6669d8 Binary files /dev/null and b/samples/ImproperFontFallback.pdf differ diff --git a/samples/InternationalChars.pdf b/samples/InternationalChars.pdf new file mode 100644 index 00000000..fd5eb667 Binary files /dev/null and b/samples/InternationalChars.pdf differ diff --git a/samples/SimpleInvoiceFilledExample1.pdf b/samples/SimpleInvoiceFilledExample1.pdf new 
file mode 100644 index 00000000..58c0da09 Binary files /dev/null and b/samples/SimpleInvoiceFilledExample1.pdf differ diff --git a/samples/SimpleInvoiceFilledExample2.pdf b/samples/SimpleInvoiceFilledExample2.pdf new file mode 100644 index 00000000..511c1802 Binary files /dev/null and b/samples/SimpleInvoiceFilledExample2.pdf differ diff --git a/samples/XMP_Metadata.pdf b/samples/XMP_Metadata.pdf new file mode 100644 index 00000000..98fc5cf6 Binary files /dev/null and b/samples/XMP_Metadata.pdf differ diff --git a/samples/bugs/Issue104a.pdf b/samples/bugs/Issue104a.pdf new file mode 100644 index 00000000..d51d7e19 Binary files /dev/null and b/samples/bugs/Issue104a.pdf differ diff --git a/samples/bugs/Issue18.pdf b/samples/bugs/Issue18.pdf new file mode 100644 index 00000000..d2fe0c63 Binary files /dev/null and b/samples/bugs/Issue18.pdf differ diff --git a/samples/bugs/Issue202.pdf b/samples/bugs/Issue202.pdf new file mode 100644 index 00000000..fada3cf6 Binary files /dev/null and b/samples/bugs/Issue202.pdf differ diff --git a/samples/bugs/Issue229_mac_roman_encoding.pdf b/samples/bugs/Issue229_mac_roman_encoding.pdf new file mode 100644 index 00000000..cee808de Binary files /dev/null and b/samples/bugs/Issue229_mac_roman_encoding.pdf differ diff --git a/samples/bugs/Issue267_array_access_on_int.pdf b/samples/bugs/Issue267_array_access_on_int.pdf new file mode 100644 index 00000000..a8149ebd Binary files /dev/null and b/samples/bugs/Issue267_array_access_on_int.pdf differ diff --git a/samples/bugs/Issue322.pdf b/samples/bugs/Issue322.pdf new file mode 100644 index 00000000..1a2b4b28 Binary files /dev/null and b/samples/bugs/Issue322.pdf differ diff --git a/samples/bugs/Issue33.pdf b/samples/bugs/Issue33.pdf new file mode 100644 index 00000000..6673bfdc Binary files /dev/null and b/samples/bugs/Issue33.pdf differ diff --git a/samples/bugs/Issue331.pdf b/samples/bugs/Issue331.pdf new file mode 100644 index 00000000..17c8765b Binary files /dev/null and 
b/samples/bugs/Issue331.pdf differ diff --git a/samples/bugs/Issue334.pdf b/samples/bugs/Issue334.pdf new file mode 100644 index 00000000..c944b9df Binary files /dev/null and b/samples/bugs/Issue334.pdf differ diff --git a/samples/bugs/Issue336_decode_hexadecimal.pdf b/samples/bugs/Issue336_decode_hexadecimal.pdf new file mode 100644 index 00000000..49ddff70 Binary files /dev/null and b/samples/bugs/Issue336_decode_hexadecimal.pdf differ diff --git a/samples/bugs/Issue356.pdf b/samples/bugs/Issue356.pdf new file mode 100644 index 00000000..7015a591 Binary files /dev/null and b/samples/bugs/Issue356.pdf differ diff --git a/samples/bugs/Issue359.pdf b/samples/bugs/Issue359.pdf new file mode 100644 index 00000000..69276b9d Binary files /dev/null and b/samples/bugs/Issue359.pdf differ diff --git a/samples/bugs/Issue391.pdf b/samples/bugs/Issue391.pdf new file mode 100644 index 00000000..3294f13a Binary files /dev/null and b/samples/bugs/Issue391.pdf differ diff --git a/samples/bugs/Issue398.pdf b/samples/bugs/Issue398.pdf new file mode 100644 index 00000000..3cd75baa Binary files /dev/null and b/samples/bugs/Issue398.pdf differ diff --git a/samples/bugs/Issue405.pdf b/samples/bugs/Issue405.pdf new file mode 100644 index 00000000..75489957 Binary files /dev/null and b/samples/bugs/Issue405.pdf differ diff --git a/samples/bugs/Issue450.pdf b/samples/bugs/Issue450.pdf new file mode 100644 index 00000000..86997d07 Binary files /dev/null and b/samples/bugs/Issue450.pdf differ diff --git a/samples/bugs/Issue454.pdf b/samples/bugs/Issue454.pdf new file mode 100644 index 00000000..ec681890 Binary files /dev/null and b/samples/bugs/Issue454.pdf differ diff --git a/samples/bugs/Issue479.pdf b/samples/bugs/Issue479.pdf new file mode 100644 index 00000000..b304d677 Binary files /dev/null and b/samples/bugs/Issue479.pdf differ diff --git a/samples/bugs/Issue494.pdf b/samples/bugs/Issue494.pdf new file mode 100644 index 00000000..7c4bfbd4 Binary files /dev/null and 
b/samples/bugs/Issue494.pdf differ diff --git a/samples/bugs/Issue557.pdf b/samples/bugs/Issue557.pdf new file mode 100644 index 00000000..6f9ab07e Binary files /dev/null and b/samples/bugs/Issue557.pdf differ diff --git a/samples/bugs/Issue585.pdf b/samples/bugs/Issue585.pdf new file mode 100644 index 00000000..b282c108 Binary files /dev/null and b/samples/bugs/Issue585.pdf differ diff --git a/samples/bugs/Issue592.pdf b/samples/bugs/Issue592.pdf new file mode 100644 index 00000000..3fe531e9 Binary files /dev/null and b/samples/bugs/Issue592.pdf differ diff --git a/samples/bugs/Issue608.pdf b/samples/bugs/Issue608.pdf new file mode 100644 index 00000000..6e1e9f58 Binary files /dev/null and b/samples/bugs/Issue608.pdf differ diff --git a/samples/bugs/Issue609.pdf b/samples/bugs/Issue609.pdf new file mode 100644 index 00000000..f40e7e12 Binary files /dev/null and b/samples/bugs/Issue609.pdf differ diff --git a/samples/bugs/Issue621.pdf b/samples/bugs/Issue621.pdf new file mode 100644 index 00000000..7611cd26 Binary files /dev/null and b/samples/bugs/Issue621.pdf differ diff --git a/samples/bugs/Issue629.pdf b/samples/bugs/Issue629.pdf new file mode 100644 index 00000000..be61f28a Binary files /dev/null and b/samples/bugs/Issue629.pdf differ diff --git a/samples/bugs/Issue665.pdf b/samples/bugs/Issue665.pdf new file mode 100644 index 00000000..b72d3e33 Binary files /dev/null and b/samples/bugs/Issue665.pdf differ diff --git a/samples/bugs/Issue668.pdf b/samples/bugs/Issue668.pdf new file mode 100644 index 00000000..dc502543 Binary files /dev/null and b/samples/bugs/Issue668.pdf differ diff --git a/samples/bugs/Issue673.pdf b/samples/bugs/Issue673.pdf new file mode 100644 index 00000000..a2138b51 Binary files /dev/null and b/samples/bugs/Issue673.pdf differ diff --git a/samples/bugs/Issue727.pdf b/samples/bugs/Issue727.pdf new file mode 100644 index 00000000..1d2f89ae Binary files /dev/null and b/samples/bugs/Issue727.pdf differ diff --git 
a/samples/bugs/Issue95_ANSI.pdf b/samples/bugs/Issue95_ANSI.pdf new file mode 100644 index 00000000..64320dc8 Binary files /dev/null and b/samples/bugs/Issue95_ANSI.pdf differ diff --git a/samples/bugs/PullRequest457.pdf b/samples/bugs/PullRequest457.pdf new file mode 100644 index 00000000..77769e86 Binary files /dev/null and b/samples/bugs/PullRequest457.pdf differ diff --git a/samples/bugs/PullRequest500.pdf b/samples/bugs/PullRequest500.pdf new file mode 100644 index 00000000..11134b44 Binary files /dev/null and b/samples/bugs/PullRequest500.pdf differ diff --git a/samples/corrupted.pdf b/samples/corrupted.pdf new file mode 100644 index 00000000..4f8189b9 Binary files /dev/null and b/samples/corrupted.pdf differ diff --git a/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf b/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf new file mode 100644 index 00000000..55a681d1 Binary files /dev/null and b/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf differ diff --git a/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf b/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf new file mode 100644 index 00000000..ad2a0e2d Binary files /dev/null and b/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf differ diff --git a/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf b/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf new file mode 100644 index 00000000..8557fe5b Binary files /dev/null and b/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf differ diff --git "a/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_i\303\247in_Acrobat_PDFMaker_17.pdf" "b/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_i\303\247in_Acrobat_PDFMaker_17.pdf" new file mode 100644 index 00000000..d2c40e1d Binary files 
/dev/null and "b/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_i\303\247in_Acrobat_PDFMaker_17.pdf" differ diff --git a/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf b/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf new file mode 100644 index 00000000..1220fe85 Binary files /dev/null and b/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf differ diff --git a/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf b/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf new file mode 100644 index 00000000..1ac0eb3c Binary files /dev/null and b/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf differ diff --git a/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf new file mode 100644 index 00000000..f71ceeac Binary files /dev/null and b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf differ diff --git a/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf new file mode 100644 index 00000000..287b9476 Binary files /dev/null and b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf differ diff --git a/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf b/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf new file mode 100644 index 00000000..d0de53c4 Binary files /dev/null and b/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf differ diff --git a/samples/not_really_encrypted.pdf b/samples/not_really_encrypted.pdf new file mode 100644 index 00000000..fe841fe8 Binary files /dev/null and 
b/samples/not_really_encrypted.pdf differ diff --git a/src/Smalot/PdfParser/Config.php b/src/Smalot/PdfParser/Config.php new file mode 100644 index 00000000..e44b1640 --- /dev/null +++ b/src/Smalot/PdfParser/Config.php @@ -0,0 +1,175 @@ + + * + * @date 2020-11-22 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Smalot\PdfParser; + +/** + * This class contains configurations used in various classes. You can override them + * manually, in case default values aren't working. + * + * @see https://github.com/smalot/pdfparser/issues/305 + */ +class Config +{ + private $fontSpaceLimit = -50; + + /** + * @var string + */ + private $horizontalOffset = ' '; + + /** + * Represents: (NUL, HT, LF, FF, CR, SP) + * + * @var string + */ + private $pdfWhitespaces = "\0\t\n\f\r "; + + /** + * Represents: (NUL, HT, LF, FF, CR, SP) + * + * @var string + */ + private $pdfWhitespacesRegex = '[\0\t\n\f\r ]'; + + /** + * Whether to retain raw image data as content or discard it to save memory + * + * @var bool + */ + private $retainImageContent = true; + + /** + * Memory limit to use when de-compressing files, in bytes. 
+ * + * @var int + */ + private $decodeMemoryLimit = 0; + + /** + * Whether to include font id and size in dataTm array + * + * @var bool + */ + private $dataTmFontInfoHasToBeIncluded = false; + + /** + * Whether to attempt to read PDFs even if they are marked as encrypted. + * + * @var bool + */ + private $ignoreEncryption = false; + + public function getFontSpaceLimit() + { + return $this->fontSpaceLimit; + } + + public function setFontSpaceLimit($value) + { + $this->fontSpaceLimit = $value; + } + + public function getHorizontalOffset(): string + { + return $this->horizontalOffset; + } + + public function setHorizontalOffset($value): void + { + $this->horizontalOffset = $value; + } + + public function getPdfWhitespaces(): string + { + return $this->pdfWhitespaces; + } + + public function setPdfWhitespaces(string $pdfWhitespaces): void + { + $this->pdfWhitespaces = $pdfWhitespaces; + } + + public function getPdfWhitespacesRegex(): string + { + return $this->pdfWhitespacesRegex; + } + + public function setPdfWhitespacesRegex(string $pdfWhitespacesRegex): void + { + $this->pdfWhitespacesRegex = $pdfWhitespacesRegex; + } + + public function getRetainImageContent(): bool + { + return $this->retainImageContent; + } + + public function setRetainImageContent(bool $retainImageContent): void + { + $this->retainImageContent = $retainImageContent; + } + + public function getDecodeMemoryLimit(): int + { + return $this->decodeMemoryLimit; + } + + public function setDecodeMemoryLimit(int $decodeMemoryLimit): void + { + $this->decodeMemoryLimit = $decodeMemoryLimit; + } + + public function getDataTmFontInfoHasToBeIncluded(): bool + { + return $this->dataTmFontInfoHasToBeIncluded; + } + + public function setDataTmFontInfoHasToBeIncluded(bool $dataTmFontInfoHasToBeIncluded): void + { + $this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded; + } + + public function getIgnoreEncryption(): bool + { + return $this->ignoreEncryption; + } + + /** + * @deprecated this is a 
temporary workaround, don't rely on it + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function setIgnoreEncryption(bool $ignoreEncryption): void + { + $this->ignoreEncryption = $ignoreEncryption; + } +} diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php index b5164f78..1fad8b1b 100644 --- a/src/Smalot/PdfParser/Document.php +++ b/src/Smalot/PdfParser/Document.php @@ -5,32 +5,35 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser; -use Smalot\PdfParser\Element\ElementDate; +use Smalot\PdfParser\Encoding\PDFDocEncoding; +use Smalot\PdfParser\Exception\MissingCatalogException; /** * Technical references : @@ -43,42 +46,39 @@ * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm * * Class Document - * - * @package Smalot\PdfParser */ class Document { /** - * @var Object[] + * @var PDFObject[] */ - protected $objects = array(); + protected $objects = []; /** * @var array */ - protected $dictionary = array(); + protected $dictionary = []; /** * @var Header */ - protected $trailer = null; + protected $trailer; /** - * @var array + * @var array */ - protected $details = null; + protected $metadata = []; /** - * + * @var array */ + protected $details; + public function __construct() { - $this->trailer = new Header(array(), $this); + $this->trailer = new Header([], $this); } - /** - * - */ public function init() { $this->buildDictionary(); @@ -87,6 +87,7 @@ public function init() // Propagate init to objects. foreach ($this->objects as $object) { + $object->getHeader()->init(); $object->init(); } } @@ -97,13 +98,29 @@ public function init() protected function buildDictionary() { // Build dictionary. 
- $this->dictionary = array(); + $this->dictionary = []; foreach ($this->objects as $id => $object) { + // Cache objects by type and subtype $type = $object->getHeader()->get('Type')->getContent(); - if (!empty($type)) { - $this->dictionary[$type][$id] = $id; + if (null != $type) { + if (!isset($this->dictionary[$type])) { + $this->dictionary[$type] = [ + 'all' => [], + 'subtype' => [], + ]; + } + + $this->dictionary[$type]['all'][$id] = $object; + + $subtype = $object->getHeader()->get('Subtype')->getContent(); + if (null != $subtype) { + if (!isset($this->dictionary[$type]['subtype'][$subtype])) { + $this->dictionary[$type]['subtype'][$subtype] = []; + } + $this->dictionary[$type]['subtype'][$subtype][$id] = $object; + } } } } @@ -114,46 +131,203 @@ protected function buildDictionary() protected function buildDetails() { // Build details array. - $details = array(); + $details = []; // Extract document info if ($this->trailer->has('Info')) { - /** @var Object $info */ - $info = $this->trailer->get('Info'); - $details = $info->getHeader()->getDetails(); + /** @var PDFObject $info */ + $info = $this->trailer->get('Info'); + // This could be an ElementMissing object, so we need to check for + // the getHeader method first. 
+ if (null !== $info && method_exists($info, 'getHeader')) { + $details = $info->getHeader()->getDetails(); + } } // Retrieve the page count try { - $pages = $this->getPages(); - $details['Pages'] = count($pages); + $pages = $this->getPages(); + $details['Pages'] = \count($pages); } catch (\Exception $e) { $details['Pages'] = 0; } + // Decode and repair encoded document properties + foreach ($details as $key => $value) { + if (\is_string($value)) { + // If the string is already UTF-8 encoded, that means we only + // need to repair Adobe's ham-fisted insertion of line-feeds + // every ~127 characters, which doesn't seem to be multi-byte + // safe + if (mb_check_encoding($value, 'UTF-8')) { + // Remove literal backslash + line-feed "\\r" + $value = str_replace("\x5c\x0d", '', $value); + + // Remove backslash plus bytes written into high part of + // multibyte unicode character + while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) { + $diff = (\ord($match[1]) - 182) * 64; + $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff)); + $value = preg_replace("/\x5c\x5c\xe0".$match[1].$match[2].'/', $newbyte, $value); + } + + // Remove bytes written into low part of multibyte unicode + // character + while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) { + $diff = \ord($match[2]) - 181; + $newbyte = \chr(\ord($match[1]) + $diff); + $value = preg_replace('/'.$match[1]."\x9c\xe0".$match[2].'/', $newbyte, $value); + } + + // Remove this byte string that Adobe occasionally adds + // between two single byte characters in a unicode string + $value = str_replace("\xe5\xb0\x8d", '', $value); + + $details[$key] = $value; + } else { + // If the string is just PDFDocEncoding, remove any line-feeds + // and decode the whole thing. 
+ $value = str_replace("\\\r", '', $value); + $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value); + } + } + } + + $details = array_merge($details, $this->metadata); + $this->details = $details; } /** - * @return array + * Extract XMP Metadata */ - public function getDictionary() + public function extractXMPMetadata(string $content): void + { + $xml = xml_parser_create(); + xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1); + + if (1 === xml_parse_into_struct($xml, $content, $values, $index)) { + /* + * short overview about the following code parts: + * + * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on, + * first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the + * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered, + * we save the current $metadata context in the $stack, then create a child array of $metadata and + * make that the current $metadata context. When a "close" XML tag is encountered, the operations are + * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current + * element) is set as the current $metadata context. 
+ */ + $metadata = []; + $stack = []; + foreach ($values as $val) { + // Standardize to lowercase + $val['tag'] = strtolower($val['tag']); + + // Ignore structural x: and rdf: XML elements + if (0 === strpos($val['tag'], 'x:')) { + continue; + } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) { + continue; + } + + switch ($val['type']) { + case 'open': + // Create an array of list items + if ('rdf:li' == $val['tag']) { + $metadata[] = []; + + // Move up one level in the stack + $stack[\count($stack)] = &$metadata; + $metadata = &$metadata[\count($metadata) - 1]; + } else { + // Else create an array of named values + $metadata[$val['tag']] = []; + + // Move up one level in the stack + $stack[\count($stack)] = &$metadata; + $metadata = &$metadata[$val['tag']]; + } + break; + + case 'complete': + if (isset($val['value'])) { + // Assign a value to this list item + if ('rdf:li' == $val['tag']) { + $metadata[] = $val['value']; + + // Else assign a value to this property + } else { + $metadata[$val['tag']] = $val['value']; + } + } + break; + + case 'close': + // If the value of this property is an array + if (\is_array($metadata)) { + // If the value is a single element array + // where the element is of type string, use + // the value of the first list item as the + // value for this property + if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) { + $metadata = $metadata[0]; + } elseif (0 == \count($metadata)) { + // if the value is an empty array, set + // the value of this property to the empty + // string + $metadata = ''; + } + } + + // Move down one level in the stack + $metadata = &$stack[\count($stack) - 1]; + unset($stack[\count($stack) - 1]); + break; + } + } + + // Only use this metadata if it's referring to a PDF + if (!isset($metadata['dc:format']) || 'application/pdf' == $metadata['dc:format']) { + // According to the XMP specifications: 'Conflict resolution + // for separate packets that describe the same 
resource is + // beyond the scope of this document.' - Section 6.1 + // Source: https://www.adobe.com/devnet/xmp.html + // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf + // So if there are multiple XMP blocks, just merge the values + // of each found block over top of the existing values + $this->metadata = array_merge($this->metadata, $metadata); + } + } + + // TODO: remove this if-clause and its content when dropping PHP 7 support + if (version_compare(PHP_VERSION, '8.0.0', '<')) { + // ref: https://www.php.net/manual/en/function.xml-parser-free.php + xml_parser_free($xml); + + // to avoid memory leaks; documentation said: + // > it was necessary to also explicitly unset the reference to parser to avoid memory leaks + unset($xml); + } + } + + public function getDictionary(): array { return $this->dictionary; } /** - * @param Object[] $objects + * @param PDFObject[] $objects */ - public function setObjects($objects = array()) + public function setObjects($objects = []) { - $this->objects = (array)$objects; + $this->objects = (array) $objects; $this->init(); } /** - * @return Object[] + * @return PDFObject[] */ public function getObjects() { @@ -161,66 +335,79 @@ public function getObjects() } /** - * @param string $id - * - * @return Object + * @return PDFObject|Font|Page|Element|null */ - public function getObjectById($id) + public function getObjectById(string $id) { if (isset($this->objects[$id])) { return $this->objects[$id]; - } else { - return null; } + + return null; } - /** - * @param string $type - * @param string $subtype - * - * @return Object[] - */ - public function getObjectsByType($type, $subtype = null) + public function hasObjectsByType(string $type, ?string $subtype = null): bool { - $objects = array(); + return 0 < \count($this->getObjectsByType($type, $subtype)); + } - foreach ($this->objects as $id => $object) { - if ($object->getHeader()->get('Type') == $type && - (is_null($subtype) || 
$object->getHeader()->get('Subtype') == $subtype) - ) { - $objects[$id] = $object; + public function getObjectsByType(string $type, ?string $subtype = null): array + { + if (!isset($this->dictionary[$type])) { + return []; + } + + if (null != $subtype) { + if (!isset($this->dictionary[$type]['subtype'][$subtype])) { + return []; } + + return $this->dictionary[$type]['subtype'][$subtype]; } - return $objects; + return $this->dictionary[$type]['all']; } /** - * @return \Object[] + * @return Font[] */ public function getFonts() { return $this->getObjectsByType('Font'); } + public function getFirstFont(): ?Font + { + $fonts = $this->getFonts(); + if ([] === $fonts) { + return null; + } + + return reset($fonts); + } + /** * @return Page[] - * @throws \Exception + * + * @throws MissingCatalogException */ public function getPages() { - if (isset($this->dictionary['Catalog'])) { + if ($this->hasObjectsByType('Catalog')) { // Search for catalog to list pages. - $id = reset($this->dictionary['Catalog']); + $catalogues = $this->getObjectsByType('Catalog'); + $catalogue = reset($catalogues); /** @var Pages $object */ - $object = $this->objects[$id]->get('Pages'); - $pages = $object->getPages(true); + $object = $catalogue->get('Pages'); + if (method_exists($object, 'getPages')) { + return $object->getPages(true); + } + } - return $pages; - } elseif (isset($this->dictionary['Pages'])) { + if ($this->hasObjectsByType('Pages')) { // Search for pages to list kids. - $pages = array(); + $pages = []; /** @var Pages[] $objects */ $objects = $this->getObjectsByType('Pages'); @@ -229,27 +416,35 @@ public function getPages() } return $pages; - } elseif (isset($this->dictionary['Page'])) { + } + + if ($this->hasObjectsByType('Page')) { // Search for 'page' (unordered pages). 
$pages = $this->getObjectsByType('Page'); return array_values($pages); - } else { - throw new \Exception('Missing catalog.'); } + + throw new MissingCatalogException('Missing catalog.'); } - /** - * @param Page $page - * - * @return string - */ - public function getText(Page $page = null) + public function getText(?int $pageLimit = null): string { - $texts = array(); + $texts = []; $pages = $this->getPages(); + // Only use the first X number of pages if $pageLimit is set and numeric. + if (\is_int($pageLimit) && 0 < $pageLimit) { + $pages = \array_slice($pages, 0, $pageLimit); + } + foreach ($pages as $index => $page) { + /** + * In some cases, the $page variable may be null. + */ + if (null === $page) { + continue; + } if ($text = trim($page->getText())) { $texts[] = $text; } @@ -258,18 +453,17 @@ public function getText(Page $page = null) return implode("\n\n", $texts); } - /** - * @param Header $header - */ + public function getTrailer(): Header + { + return $this->trailer; + } + public function setTrailer(Header $trailer) { $this->trailer = $trailer; } - /** - * @return array - */ - public function getDetails($deep = true) + public function getDetails(): array { return $this->details; } diff --git a/src/Smalot/PdfParser/Element.php b/src/Smalot/PdfParser/Element.php index 3b2a5456..80660303 100644 --- a/src/Smalot/PdfParser/Element.php +++ b/src/Smalot/PdfParser/Element.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser; @@ -43,57 +45,34 @@ /** * Class Element - * - * @package Smalot\PdfParser */ class Element { /** - * @var Document + * @var Document|null */ - protected $document = null; + protected $document; - /** - * @var mixed - */ - protected $value = null; + protected $value; - /** - * @param mixed $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct($value, ?Document $document = null) { - $this->value = $value; + $this->value = $value; $this->document = $document; } - /** - * - */ public function init() { - } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { - return ($value == $this->value); + return $value == $this->value; } - /** - * @param mixed $value - * - * @return bool - */ - public function contains($value) + public function contains($value): bool { - if (is_array($this->value)) { + if (\is_array($this->value)) { /** @var Element $val */ foreach ($this->value as $val) { if 
($val->equals($value)) { @@ -102,55 +81,47 @@ public function contains($value) } return false; - } else { - return $this->equals($value); } + + return $this->equals($value); } - /** - * @return mixed - */ public function getContent() { return $this->value; } - /** - * @return string - */ - public function __toString() + public function __toString(): string { - return (string)($this->value); + return (string) $this->value; } - /** - * @param string $content - * @param Document $document - * @param int $position - * - * @return array - * @throws \Exception - */ - public static function parse($content, Document $document = null, &$position = 0) + public static function parse(string $content, ?Document $document = null, int &$position = 0) { - $args = func_get_args(); + $args = \func_get_args(); $only_values = isset($args[3]) ? $args[3] : false; - $content = trim($content); - $values = array(); + $content = trim($content); + $values = []; do { $old_position = $position; if (!$only_values) { - if (!preg_match('/^\s*(?P\/[A-Z0-9\._]+)(?P.*)/si', substr($content, $position), $match)) { + if (!preg_match('/\G\s*(?P\/[A-Z#0-9\._]+)(?P.*)/si', $content, $match, 0, $position)) { break; } else { - $name = ltrim($match['name'], '/'); - $value = $match['value']; - $position = strpos($content, $value, $position + strlen($match['name'])); + $name = preg_replace_callback( + '/#([0-9a-f]{2})/i', + function ($m) { + return \chr(base_convert($m[1], 16, 10)); + }, + ltrim($match['name'], '/') + ); + $value = $match['value']; + $position = strpos($content, $value, $position + \strlen($match['name'])); } } else { - $name = count($values); + $name = \count($values); $value = substr($content, $position); } @@ -178,7 +149,7 @@ public static function parse($content, Document $document = null, &$position = 0 $position = $old_position; break; } - } while ($position < strlen($content)); + } while ($position < \strlen($content)); return $values; } diff --git 
a/src/Smalot/PdfParser/Element/ElementArray.php b/src/Smalot/PdfParser/Element/ElementArray.php index 7f2a045a..b54bf843 100644 --- a/src/Smalot/PdfParser/Element/ElementArray.php +++ b/src/Smalot/PdfParser/Element/ElementArray.php @@ -5,55 +5,48 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; use Smalot\PdfParser\Header; -use Smalot\PdfParser\Object; +use Smalot\PdfParser\PDFObject; /** * Class ElementArray - * - * @package Smalot\PdfParser\Element */ class ElementArray extends Element { - /** - * @param string $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct($value, ?Document $document = null) { parent::__construct($value, $document); } - /** - * @return mixed - */ public function getContent() { foreach ($this->value as $name => $element) { @@ -63,34 +56,26 @@ public function getContent() return parent::getContent(); } - /** - * @return array - */ - public function getRawContent() + public function getRawContent(): array { return $this->value; } - /** - * @param bool $deep - * - * @return array - */ - public function getDetails($deep = true) + public function getDetails(bool $deep = true): array { - $values = array(); + $values = []; $elements = $this->getContent(); foreach ($elements as $key => $element) { if ($element instanceof Header && $deep) { $values[$key] = $element->getDetails($deep); - } elseif ($element instanceof Object && $deep) { + } elseif ($element instanceof PDFObject && $deep) { $values[$key] = $element->getDetails(false); - } elseif ($element instanceof ElementArray) { + } elseif ($element instanceof self) { if ($deep) { $values[$key] = $element->getDetails(); } - } elseif ($element instanceof Element && !($element instanceof ElementArray)) { + } elseif ($element instanceof Element && !($element instanceof self)) { $values[$key] = $element->getContent(); } } @@ -98,24 +83,19 @@ public function getDetails($deep = true) return $values; } - /** - * @return string - */ - public function __toString() + public function __toString(): string { return implode(',', $this->value); } /** - * @param string $name - * - * 
@return Element|Object + * @return Element|PDFObject */ - protected function resolveXRef($name) + protected function resolveXRef(string $name) { if (($obj = $this->value[$name]) instanceof ElementXRef) { - /** @var Object $obj */ - $obj = $this->document->getObjectById($obj->getId()); + /** @var ElementXRef $obj */ + $obj = $this->document->getObjectById($obj->getId()); $this->value[$name] = $obj; } @@ -123,35 +103,33 @@ protected function resolveXRef($name) } /** - * @param string $content - * @param Document $document - * @param int $offset + * @todo: These methods return mixed and mismatched types throughout the hierarchy * * @return bool|ElementArray */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\[(?P.*)/is', $content, $match)) { preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches); $level = 0; - $sub = ''; + $sub = ''; foreach ($matches[0] as $part) { $sub .= $part; - $level += (strpos($part, '[') !== false ? 1 : -1); + $level += (false !== strpos($part, '[') ? 1 : -1); if ($level <= 0) { break; } } // Removes 1 level [ and ]. - $sub = substr(trim($sub), 1, -1); + $sub = substr(trim($sub), 1, -1); $sub_offset = 0; - $values = Element::parse($sub, $document, $sub_offset, true); + $values = Element::parse($sub, $document, $sub_offset, true); $offset += strpos($content, '[') + 1; // Find next ']' position - $offset += strlen($sub) + 1; + $offset += \strlen($sub) + 1; return new self($values, $document); } diff --git a/src/Smalot/PdfParser/Element/ElementBoolean.php b/src/Smalot/PdfParser/Element/ElementBoolean.php index 5bb2249e..55fb4638 100644 --- a/src/Smalot/PdfParser/Element/ElementBoolean.php +++ b/src/Smalot/PdfParser/Element/ElementBoolean.php @@ -5,82 +5,69 @@ * This file is part of the PdfParser library. 
* * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; /** * Class ElementBoolean - * - * @package Smalot\PdfParser\Element */ class ElementBoolean extends Element { /** - * @param string $value - * @param Document $document + * @param string|bool $value */ - public function __construct($value, Document $document = null) + public function __construct($value) { - parent::__construct((strtolower($value) == 'true' || $value === true), null); + parent::__construct('true' == strtolower($value) || true === $value, null); } - /** - * @return string - */ - public function __toString() + public function __toString(): string { return $this->value ? 
'true' : 'false'; } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { - return ($this->getContent() === $value); + return $this->getContent() === $value; } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementBoolean */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(?Ptrue|false)/is', $content, $match)) { - $value = $match['value']; - $offset += strpos($content, $value) + strlen($value); + $value = $match['value']; + $offset += strpos($content, $value) + \strlen($value); - return new self($value, $document); + return new self($value); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementDate.php b/src/Smalot/PdfParser/Element/ElementDate.php index 008be16a..f1f2df6f 100644 --- a/src/Smalot/PdfParser/Element/ElementDate.php +++ b/src/Smalot/PdfParser/Element/ElementDate.php @@ -5,48 +5,47 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHPi, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; /** * Class ElementDate - * - * @package Smalot\PdfParser\Element */ class ElementDate extends ElementString { /** - * @var array + * @var array */ - protected static $formats = array( - 4 => 'Y', - 6 => 'Ym', - 8 => 'Ymd', + protected static $formats = [ + 4 => 'Y', + 6 => 'Ym', + 8 => 'Ymd', 10 => 'YmdH', 12 => 'YmdHi', 14 => 'YmdHis', @@ -54,7 +53,7 @@ class ElementDate extends ElementString 17 => 'YmdHisO', 18 => 'YmdHisO', 19 => 'YmdHisO', - ); + ]; /** * @var string @@ -62,32 +61,25 @@ class ElementDate extends ElementString protected $format = 'c'; /** - * @param \DateTime $value - * @param Document $document + * @var \DateTime */ - public function __construct($value, Document $document = null) + protected $value; + + public function __construct($value) { if (!($value instanceof \DateTime)) { - throw new \Exception('DateTime required.'); + throw new \Exception('DateTime required.'); // FIXME: Sometimes strings are passed to this function } - parent::__construct($value, null); + parent::__construct($value); } - /** - * @param string $format - */ - public function setFormat($format) + public function setFormat(string $format) { $this->format = $format; } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { if ($value instanceof \DateTime) { $timestamp = $value->getTimeStamp(); @@ -95,25 +87,18 @@ public function equals($value) $timestamp = strtotime($value); } - return ($timestamp == $this->value->getTimeStamp()); + return $timestamp == $this->value->getTimeStamp(); } - /** - * 
@return string - */ - public function __toString() + public function __toString(): string { - return (string)($this->value->format($this->format)); + return (string) $this->value->format($this->format); } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementDate */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\(D\:(?P.*?)\)/s', $content, $match)) { $name = $match['name']; @@ -125,18 +110,18 @@ public static function parse($content, Document $document = null, &$offset = 0) if (preg_match('/^\d{4}(\d{2}(\d{2}(\d{2}(\d{2}(\d{2}(Z(\d{2,4})?|[\+-]?\d{2}(\d{2})?)?)?)?)?)?)?$/', $name)) { if ($pos = strpos($name, 'Z')) { $name = substr($name, 0, $pos + 1); - } elseif (strlen($name) == 18 && preg_match('/[^\+-]0000$/', $name)) { - $name = substr($name, 0, -4) . '+0000'; + } elseif (18 == \strlen($name) && preg_match('/[^\+-]0000$/', $name)) { + $name = substr($name, 0, -4).'+0000'; } - $format = self::$formats[strlen($name)]; - $date = \DateTime::createFromFormat($format, $name); + $format = self::$formats[\strlen($name)]; + $date = \DateTime::createFromFormat($format, $name, new \DateTimeZone('UTC')); } else { // special cases if (preg_match('/^\d{1,2}-\d{1,2}-\d{4},?\s+\d{2}:\d{2}:\d{2}[\+-]\d{4}$/', $name)) { - $name = str_replace(',', '', $name); + $name = str_replace(',', '', $name); $format = 'n-j-Y H:i:sO'; - $date = \DateTime::createFromFormat($format, $name); + $date = \DateTime::createFromFormat($format, $name, new \DateTimeZone('UTC')); } } @@ -144,10 +129,9 @@ public static function parse($content, Document $document = null, &$offset = 0) return false; } - $offset += strpos($content, '(D:') + strlen($match['name']) + 4; // 1 for '(D:' and ')' - $element = new self($date, $document); + $offset += strpos($content, '(D:') + \strlen($match['name']) + 4; // 1 
for '(D:' and ')' - return $element; + return new self($date); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementHexa.php b/src/Smalot/PdfParser/Element/ElementHexa.php index 95c0e92e..3fc34136 100644 --- a/src/Smalot/PdfParser/Element/ElementHexa.php +++ b/src/Smalot/PdfParser/Element/ElementHexa.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser\Element; @@ -34,28 +36,22 @@ /** * Class ElementHexa - * - * @package Smalot\PdfParser\Element */ class ElementHexa extends ElementString { /** - * @param string $content - * @param Document $document - * @param int $offset - * - * @return bool|ElementHexa + * @return bool|ElementHexa|ElementDate */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\<(?P[A-F0-9]+)\>/is', $content, $match)) { - $name = $match['name']; - $offset += strpos($content, '<' . $name) + strlen($name) + 2; // 1 for '>' + $name = $match['name']; + $offset += strpos($content, '<'.$name) + \strlen($name) + 2; // 1 for '>' // repackage string as standard - $name = '(' . self::decode($name, $document) . ')'; - $element = false; + $name = '('.self::decode($name).')'; + $element = ElementDate::parse($name, $document); - if (!($element = ElementDate::parse($name, $document))) { + if (!$element) { $element = ElementString::parse($name, $document); } @@ -65,28 +61,30 @@ public static function parse($content, Document $document = null, &$offset = 0) return false; } - /** - * @param string $value - * @param Document $document - */ - public static function decode($value, Document $document = null) + public static function decode(string $value): string { - $text = ''; - $length = strlen($value); + $text = ''; - if (substr($value, 0, 2) == '00') { - for ($i = 0; $i < $length; $i += 4) { + // Filter $value of non-hexadecimal characters + $value = (string) preg_replace('/[^0-9a-f]/i', '', $value); + + // Check for leading zeros (4-byte hexadecimal indicator), or + // the BE BOM + if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) { + $value = (string) preg_replace('/^feff/i', '', $value); + for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) { $hex = substr($value, $i, 4); - $text .= 
'&#' . str_pad(hexdec($hex), 4, '0', STR_PAD_LEFT) . ';'; + $text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';'; } } else { - for ($i = 0; $i < $length; $i += 2) { + // Otherwise decode this as 2-byte hexadecimal + for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) { $hex = substr($value, $i, 2); - $text .= chr(hexdec($hex)); + $text .= \chr(hexdec($hex)); } } - - $text = html_entity_decode($text, ENT_NOQUOTES, 'UTF-8'); + + $text = html_entity_decode($text, \ENT_NOQUOTES, 'UTF-8'); return $text; } diff --git a/src/Smalot/PdfParser/Element/ElementMissing.php b/src/Smalot/PdfParser/Element/ElementMissing.php index e6bec9a1..d2fc0008 100644 --- a/src/Smalot/PdfParser/Element/ElementMissing.php +++ b/src/Smalot/PdfParser/Element/ElementMissing.php @@ -5,80 +5,61 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser\Element; use Smalot\PdfParser\Element; -use Smalot\PdfParser\Document; /** * Class ElementMissing */ class ElementMissing extends Element { - /** - * @param string $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct() { parent::__construct(null, null); } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { return false; } - /** - * @param mixed $value - * - * @return bool - */ - public function contains($value) + public function contains($value): bool { return false; } - /** - * @return bool - */ - public function getContent() + public function getContent(): bool { return false; } - /** - * @return string - */ - public function __toString() + public function __toString(): string { return ''; } diff --git a/src/Smalot/PdfParser/Element/ElementName.php b/src/Smalot/PdfParser/Element/ElementName.php index d9051c7b..6e8d97ac 100644 --- a/src/Smalot/PdfParser/Element/ElementName.php +++ b/src/Smalot/PdfParser/Element/ElementName.php @@ -5,76 +5,63 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; use Smalot\PdfParser\Font; /** * Class ElementName - * - * @package Smalot\PdfParser\Element */ class ElementName extends Element { - /** - * @param string $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct(string $value) { parent::__construct($value, null); } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { return $value == $this->value; } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementName */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { - if (preg_match('/^\s*\/(?P[A-Z0-9\-\+,#\.]+)/is', $content, $match)) { - $name = $match['name']; - $offset += strpos($content, $name) + strlen($name); - $name = Font::decodeEntities($name); + if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) { + $name = $match[1]; + $offset += strpos($content, $name) + \strlen($name); + $name = Font::decodeEntities($name); - return new self($name, $document); + return new self($name); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementNull.php b/src/Smalot/PdfParser/Element/ElementNull.php 
index 0ca5be87..9af88434 100644 --- a/src/Smalot/PdfParser/Element/ElementNull.php +++ b/src/Smalot/PdfParser/Element/ElementNull.php @@ -5,81 +5,65 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; /** * Class ElementNull - * - * @package Smalot\PdfParser\Element */ class ElementNull extends Element { - /** - * @param string $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct() { parent::__construct(null, null); } - /** - * @return string - */ - public function __toString() + public function __toString(): string { return 'null'; } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { - return ($this->getContent() === $value); + return $this->getContent() === $value; } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementNull */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(null)/s', $content, $match)) { - $offset += strpos($content, 'null') + strlen('null'); + $offset += strpos($content, 'null') + \strlen('null'); - return new self(null, $document); + return new self(); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementNumeric.php b/src/Smalot/PdfParser/Element/ElementNumeric.php index 772274c7..5454acc0 100644 --- a/src/Smalot/PdfParser/Element/ElementNumeric.php +++ b/src/Smalot/PdfParser/Element/ElementNumeric.php @@ -5,64 +5,56 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; /** * Class ElementNumeric - * - * @package Smalot\PdfParser\Element */ class ElementNumeric extends Element { - /** - * @param string $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct(string $value) { - parent::__construct(floatval($value), null); + parent::__construct((float) $value, null); } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementNumeric */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(?P\-?[0-9\.]+)/s', $content, $match)) { - $value = $match['value']; - $offset += strpos($content, $value) + strlen($value); + $value = $match['value']; + $offset += strpos($content, $value) + \strlen($value); - return new self($value, $document); + 
return new self($value); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementString.php b/src/Smalot/PdfParser/Element/ElementString.php index a4c5ae2e..011bcf46 100644 --- a/src/Smalot/PdfParser/Element/ElementString.php +++ b/src/Smalot/PdfParser/Element/ElementString.php @@ -5,90 +5,77 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; use Smalot\PdfParser\Font; /** * Class ElementString - * - * @package Smalot\PdfParser\Element */ class ElementString extends Element { - /** - * @param string $value - * @param Document $document - */ - public function __construct($value, Document $document = null) + public function __construct($value) { parent::__construct($value, null); } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { return $value == $this->value; } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementString */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\((?P.*)/s', $content, $match)) { $name = $match['name']; // Find next ')' not escaped. $cur_start_text = $start_search_end = 0; - while (($cur_start_pos = strpos($name, ')', $start_search_end)) !== false) { + while (false !== ($cur_start_pos = strpos($name, ')', $start_search_end))) { $cur_extract = substr($name, $cur_start_text, $cur_start_pos - $cur_start_text); preg_match('/(?P[\\\]*)$/s', $cur_extract, $match); - if (!(strlen($match['escape']) % 2)) { + if (!(\strlen($match['escape']) % 2)) { break; } $start_search_end = $cur_start_pos + 1; } // Extract string. 
- $name = substr($name, 0, $cur_start_pos); + $name = substr($name, 0, (int) $cur_start_pos); $offset += strpos($content, '(') + $cur_start_pos + 2; // 2 for '(' and ')' - $name = str_replace( - array('\\\\', '\\ ', '\\/', '\(', '\)', '\n', '\r', '\t'), - array('\\', ' ', '/', '(', ')', "\n", "\r", "\t"), + $name = str_replace( + ['\\\\', '\\ ', '\\/', '\(', '\)', '\n', '\r', '\t'], + ['\\', ' ', '/', '(', ')', "\n", "\r", "\t"], $name ); @@ -98,7 +85,7 @@ public static function parse($content, Document $document = null, &$offset = 0) $name = Font::decodeHexadecimal($name, false); $name = Font::decodeUnicode($name); - return new self($name, $document); + return new self($name); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementStruct.php b/src/Smalot/PdfParser/Element/ElementStruct.php index ff5c00e7..c37b6da4 100644 --- a/src/Smalot/PdfParser/Element/ElementStruct.php +++ b/src/Smalot/PdfParser/Element/ElementStruct.php @@ -5,74 +5,69 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. 
* - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; use Smalot\PdfParser\Header; /** * Class ElementStruct - * - * @package Smalot\PdfParser\Element */ class ElementStruct extends Element { /** - * @param string $content - * @param Document $document - * @param int $offset - * - * @return bool|ElementStruct + * @return false|Header */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*<<(?P.*)/is', $content)) { preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches); $level = 0; - $sub = ''; + $sub = ''; foreach ($matches[0] as $part) { $sub .= $part; - $level += (strpos($part, '<<') !== false ? 1 : -1); + $level += (false !== strpos($part, '<<') ? 1 : -1); if ($level <= 0) { break; } } - $offset += strpos($content, '<<') + strlen(rtrim($sub)); + $offset += strpos($content, '<<') + \strlen(rtrim($sub)); // Removes '<<' and '>>'. - $sub = trim(preg_replace('/^\s*<<(.*)>>\s*$/s', '\\1', $sub)); + $sub = trim((string) preg_replace('/^\s*<<(.*)>>\s*$/s', '\\1', $sub)); $position = 0; $elements = Element::parse($sub, $document, $position); - $header = new Header($elements, $document); - return $header; + return new Header($elements, $document); } return false; diff --git a/src/Smalot/PdfParser/Element/ElementXRef.php b/src/Smalot/PdfParser/Element/ElementXRef.php index 01072214..ebba71a1 100644 --- a/src/Smalot/PdfParser/Element/ElementXRef.php +++ b/src/Smalot/PdfParser/Element/ElementXRef.php @@ -5,89 +5,89 @@ * This file is part of the PdfParser library. 
* * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Element; -use Smalot\PdfParser\Element; use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; /** * Class ElementXRef - * - * @package Smalot\PdfParser\Element */ class ElementXRef extends Element { - /** - * @return string - */ - public function getId() + public function getId(): string { return $this->getContent(); } - /** - * @return mixed - */ public function getObject() { return $this->document->getObjectById($this->getId()); } - /** - * @param mixed $value - * - * @return bool - */ - public function equals($value) + public function equals($value): bool { - $id = ($value instanceof ElementXRef) ? 
$value->getId() : $value; + /** + * In case $value is a number and $this->value is a string like 5_0 + * + * Without this if-clause code like: + * + * $element = new ElementXRef('5_0'); + * $this->assertTrue($element->equals(5)); + * + * would fail (= 5_0 and 5 are not equal in PHP 8.0+). + */ + if ( + true === is_numeric($value) + && true === \is_string($this->getContent()) + && 1 === preg_match('/[0-9]+\_[0-9]+/', $this->getContent(), $matches) + ) { + return (float) $this->getContent() == $value; + } + + $id = ($value instanceof self) ? $value->getId() : $value; return $this->getId() == $id; } - /** - * @return string - */ - public function __toString() + public function __toString(): string { - return '#Obj#' . $this->getId(); + return '#Obj#'.$this->getId(); } /** - * @param string $content - * @param Document $document - * @param int $offset - * * @return bool|ElementXRef */ - public static function parse($content, Document $document = null, &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(?P[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) { $id = $match['id']; - $offset += strpos($content, $id) + strlen($id); + $offset += strpos($content, $id) + \strlen($id); $id = str_replace(' ', '_', rtrim($id, ' R')); return new self($id, $document); diff --git a/src/Smalot/PdfParser/Encoding.php b/src/Smalot/PdfParser/Encoding.php index 852cc201..511411b8 100644 --- a/src/Smalot/PdfParser/Encoding.php +++ b/src/Smalot/PdfParser/Encoding.php @@ -5,39 +5,42 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser; use Smalot\PdfParser\Element\ElementNumeric; +use Smalot\PdfParser\Encoding\EncodingLocator; +use Smalot\PdfParser\Encoding\PostScriptGlyphs; +use Smalot\PdfParser\Exception\EncodingNotFoundException; /** * Class Encoding - * - * @package Smalot\PdfParser */ -class Encoding extends Object +class Encoding extends PDFObject { /** * @var array @@ -54,30 +57,22 @@ class Encoding extends Object */ protected $mapping; - /** - * - */ public function init() { - $this->mapping = array(); - $this->differences = array(); - $this->encoding = null; + $this->mapping = []; + $this->differences = []; + $this->encoding = []; if ($this->has('BaseEncoding')) { - // Load reference table charset. - $baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent()); - $className = '\\Smalot\\PdfParser\\Encoding\\' . $baseEncoding; - - if (class_exists($className)) { - $class = new $className(); - $this->encoding = $class->getTranslations(); - } else { - throw new \Exception('Missing encoding data for: "' . $baseEncoding . 
'".'); - } + $this->encoding = EncodingLocator::getEncoding($this->getEncodingClass())->getTranslations(); // Build table including differences. $differences = $this->get('Differences')->getContent(); - $code = 0; + $code = 0; + + if (!\is_array($differences)) { + return; + } foreach ($differences as $difference) { /** @var ElementNumeric $difference */ @@ -87,48 +82,81 @@ public function init() } // ElementName - $this->differences[$code] = $difference->getContent(); + $this->differences[$code] = $difference; + if (\is_object($difference)) { + $this->differences[$code] = $difference->getContent(); + } // For the next char. - $code++; + ++$code; } - // Build final mapping (custom => standard). - $table = array_flip(array_reverse($this->encoding, true)); - + $this->mapping = $this->encoding; foreach ($this->differences as $code => $difference) { - /** @var string $difference */ - $this->mapping[$code] = (isset($table[$difference]) ? $table[$difference] : Font::MISSING); + /* @var string $difference */ + $this->mapping[$code] = $difference; } } } - /** - * @return array - */ - public function getDetails($deep = true) + public function getDetails(bool $deep = true): array { - $details = array(); + $details = []; - $details['BaseEncoding'] = ($this->has('BaseEncoding') ? (string)$this->get('BaseEncoding') : 'Ansi'); - $details['Differences'] = ($this->has('Differences') ? (string)$this->get('Differences') : ''); + $details['BaseEncoding'] = ($this->has('BaseEncoding') ? (string) $this->get('BaseEncoding') : 'Ansi'); + $details['Differences'] = ($this->has('Differences') ? (string) $this->get('Differences') : ''); $details += parent::getDetails($deep); return $details; } + public function translateChar($dec): ?int + { + if (isset($this->mapping[$dec])) { + $dec = $this->mapping[$dec]; + } + + return PostScriptGlyphs::getCodePoint($dec); + } + /** - * @param int $char + * Returns encoding class name if available or empty string (only prior PHP 7.4). 
* - * @return int + * @throws \Exception On PHP 7.4+ an exception is thrown if encoding class doesn't exist. */ - public function translateChar($dec) + public function __toString(): string { - if (isset($this->mapping[$dec])) { - $dec = $this->mapping[$dec]; + try { + return $this->getEncodingClass(); + } catch (\Exception $e) { + // prior to PHP 7.4 toString has to return an empty string. + if (version_compare(\PHP_VERSION, '7.4.0', '<')) { + return ''; + } + throw $e; + } + } + + /** + * @throws EncodingNotFoundException + */ + protected function getEncodingClass(): string + { + // Load reference table charset. + $baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent()); + + // Check for empty BaseEncoding field value + if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) { + $baseEncoding = 'StandardEncoding'; + } + + $className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding; + + if (!class_exists($className)) { + throw new EncodingNotFoundException('Missing encoding data for: "'.$baseEncoding.'".'); } - return $dec; + return $className; } } diff --git a/src/Smalot/PdfParser/Encoding/AbstractEncoding.php b/src/Smalot/PdfParser/Encoding/AbstractEncoding.php new file mode 100644 index 00000000..aea9c020 --- /dev/null +++ b/src/Smalot/PdfParser/Encoding/AbstractEncoding.php @@ -0,0 +1,8 @@ + - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ // Source : http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm @@ -34,41 +36,39 @@ /** * Class ISOLatin1Encoding - * - * @package Smalot\PdfParser\Encoding */ -class ISOLatin1Encoding +class ISOLatin1Encoding extends AbstractEncoding { - public function getTranslations() + public function getTranslations(): array { $encoding = - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - 'space exclam quotedbl numbersign dollar percent ampersand quoteright ' . - 'parenleft parenright asterisk plus comma minus period slash zero one ' . - 'two three four five six seven eight nine colon semicolon less equal ' . - 'greater question at A B C D E F G H I J K L M N O P Q R S T U V W X ' . - 'Y Z bracketleft backslash bracketright asciicircum underscore ' . - 'quoteleft a b c d e f g h i j k l m n o p q r s t u v w x y z ' . - 'braceleft bar braceright asciitilde .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef dotlessi grave acute ' . - 'circumflex tilde macron breve dotaccent dieresis .notdef ring ' . - 'cedilla .notdef hungarumlaut ogonek caron space exclamdown cent ' . 
- 'sterling currency yen brokenbar section dieresis copyright ' . - 'ordfeminine guillemotleft logicalnot hyphen registered macron degree ' . - 'plusminus twosuperior threesuperior acute mu paragraph ' . - 'periodcentered cedilla onesuperior ordmasculine guillemotright ' . - 'onequarter onehalf threequarters questiondown Agrave Aacute ' . - 'Acircumflex Atilde Adieresis Aring AE Ccedilla Egrave Eacute ' . - 'Ecircumflex Edieresis Igrave Iacute Icircumflex Idieresis Eth Ntilde ' . - 'Ograve Oacute Ocircumflex Otilde Odieresis multiply Oslash Ugrave ' . - 'Uacute Ucircumflex Udieresis Yacute Thorn germandbls agrave aacute ' . - 'acircumflex atilde adieresis aring ae ccedilla egrave eacute ' . - 'ecircumflex edieresis igrave iacute icircumflex idieresis eth ntilde ' . - 'ograve oacute ocircumflex otilde odieresis divide oslash ugrave ' . + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + 'space exclam quotedbl numbersign dollar percent ampersand quoteright '. + 'parenleft parenright asterisk plus comma minus period slash zero one '. + 'two three four five six seven eight nine colon semicolon less equal '. + 'greater question at A B C D E F G H I J K L M N O P Q R S T U V W X '. + 'Y Z bracketleft backslash bracketright asciicircum underscore '. + 'quoteleft a b c d e f g h i j k l m n o p q r s t u v w x y z '. + 'braceleft bar braceright asciitilde .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef dotlessi grave acute '. + 'circumflex tilde macron breve dotaccent dieresis .notdef ring '. + 'cedilla .notdef hungarumlaut ogonek caron space exclamdown cent '. + 'sterling currency yen brokenbar section dieresis copyright '. 
+ 'ordfeminine guillemotleft logicalnot hyphen registered macron degree '. + 'plusminus twosuperior threesuperior acute mu paragraph '. + 'periodcentered cedilla onesuperior ordmasculine guillemotright '. + 'onequarter onehalf threequarters questiondown Agrave Aacute '. + 'Acircumflex Atilde Adieresis Aring AE Ccedilla Egrave Eacute '. + 'Ecircumflex Edieresis Igrave Iacute Icircumflex Idieresis Eth Ntilde '. + 'Ograve Oacute Ocircumflex Otilde Odieresis multiply Oslash Ugrave '. + 'Uacute Ucircumflex Udieresis Yacute Thorn germandbls agrave aacute '. + 'acircumflex atilde adieresis aring ae ccedilla egrave eacute '. + 'ecircumflex edieresis igrave iacute icircumflex idieresis eth ntilde '. + 'ograve oacute ocircumflex otilde odieresis divide oslash ugrave '. 'uacute ucircumflex udieresis yacute thorn ydieresis'; return explode(' ', $encoding); diff --git a/src/Smalot/PdfParser/Encoding/ISOLatin9Encoding.php b/src/Smalot/PdfParser/Encoding/ISOLatin9Encoding.php index d58cc252..616a0f55 100644 --- a/src/Smalot/PdfParser/Encoding/ISOLatin9Encoding.php +++ b/src/Smalot/PdfParser/Encoding/ISOLatin9Encoding.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ // Source : http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm @@ -34,41 +36,39 @@ /** * Class ISOLatin9Encoding - * - * @package Smalot\PdfParser\Encoding */ -class ISOLatin9Encoding +class ISOLatin9Encoding extends AbstractEncoding { - public function getTranslations() + public function getTranslations(): array { $encoding = - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - 'space exclam quotedbl numbersign dollar percent ampersand quoteright ' . - 'parenleft parenright asterisk plus comma minus period slash zero one ' . - 'two three four five six seven eight nine colon semicolon less equal ' . - 'greater question at A B C D E F G H I J K L M N O P Q R S T U V W X ' . - 'Y Z bracketleft backslash bracketright asciicircum underscore ' . - 'quoteleft a b c d e f g h i j k l m n o p q r s t u v w x y z ' . - 'braceleft bar braceright asciitilde .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef dotlessi grave acute ' . - 'circumflex tilde macron breve dotaccent dieresis .notdef ring ' . - 'cedilla .notdef hungarumlaut ogonek caron space exclamdown cent ' . - 'sterling Euro yen Scaron section scaron copyright ' . - 'ordfeminine guillemotleft logicalnot hyphen registered macron degree ' . - 'plusminus twosuperior threesuperior Zcaron mu paragraph ' . 
- 'periodcentered zcaron onesuperior ordmasculine guillemotright ' . - 'OE oe Ydieresis questiondown Agrave Aacute ' . - 'Acircumflex Atilde Adieresis Aring AE Ccedilla Egrave Eacute ' . - 'Ecircumflex Edieresis Igrave Iacute Icircumflex Idieresis Eth Ntilde ' . - 'Ograve Oacute Ocircumflex Otilde Odieresis multiply Oslash Ugrave ' . - 'Uacute Ucircumflex Udieresis Yacute Thorn germandbls agrave aacute ' . - 'acircumflex atilde adieresis aring ae ccedilla egrave eacute ' . - 'ecircumflex edieresis igrave iacute icircumflex idieresis eth ntilde ' . - 'ograve oacute ocircumflex otilde odieresis divide oslash ugrave ' . + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + 'space exclam quotedbl numbersign dollar percent ampersand quoteright '. + 'parenleft parenright asterisk plus comma minus period slash zero one '. + 'two three four five six seven eight nine colon semicolon less equal '. + 'greater question at A B C D E F G H I J K L M N O P Q R S T U V W X '. + 'Y Z bracketleft backslash bracketright asciicircum underscore '. + 'quoteleft a b c d e f g h i j k l m n o p q r s t u v w x y z '. + 'braceleft bar braceright asciitilde .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef dotlessi grave acute '. + 'circumflex tilde macron breve dotaccent dieresis .notdef ring '. + 'cedilla .notdef hungarumlaut ogonek caron space exclamdown cent '. + 'sterling Euro yen Scaron section scaron copyright '. + 'ordfeminine guillemotleft logicalnot hyphen registered macron degree '. + 'plusminus twosuperior threesuperior Zcaron mu paragraph '. + 'periodcentered zcaron onesuperior ordmasculine guillemotright '. + 'OE oe Ydieresis questiondown Agrave Aacute '. 
+ 'Acircumflex Atilde Adieresis Aring AE Ccedilla Egrave Eacute '. + 'Ecircumflex Edieresis Igrave Iacute Icircumflex Idieresis Eth Ntilde '. + 'Ograve Oacute Ocircumflex Otilde Odieresis multiply Oslash Ugrave '. + 'Uacute Ucircumflex Udieresis Yacute Thorn germandbls agrave aacute '. + 'acircumflex atilde adieresis aring ae ccedilla egrave eacute '. + 'ecircumflex edieresis igrave iacute icircumflex idieresis eth ntilde '. + 'ograve oacute ocircumflex otilde odieresis divide oslash ugrave '. 'uacute ucircumflex udieresis yacute thorn ydieresis'; return explode(' ', $encoding); diff --git a/src/Smalot/PdfParser/Encoding/MacRomanEncoding.php b/src/Smalot/PdfParser/Encoding/MacRomanEncoding.php index e0e3d70b..c47131c9 100644 --- a/src/Smalot/PdfParser/Encoding/MacRomanEncoding.php +++ b/src/Smalot/PdfParser/Encoding/MacRomanEncoding.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. 
* - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ // Source : http://www.opensource.apple.com/source/vim/vim-34/vim/runtime/print/mac-roman.ps @@ -34,45 +36,43 @@ /** * Class MacRomanEncoding - * - * @package Smalot\PdfParser\Encoding */ -class MacRomanEncoding +class MacRomanEncoding extends AbstractEncoding { - public function getTranslations() + public function getTranslations(): array { $encoding = - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - 'space exclam quotedbl numbersign dollar percent ampersand quotesingle ' . - 'parenleft parenright asterisk plus comma minus period slash ' . - 'zero one two three four five six seven ' . - 'eight nine colon semicolon less equal greater question ' . - 'at A B C D E F G ' . - 'H I J K L M N O ' . - 'P Q R S T U V W ' . - 'X Y Z bracketleft backslash bracketright asciicircum underscore ' . - 'grave a b c d e f g ' . - 'h i j k l m n o ' . - 'p q r s t u v w ' . - 'x y z braceleft bar braceright asciitilde .notdef ' . - 'Adieresis Aring Ccedilla Eacute Ntilde Odieresis Udieresis aacute ' . - 'agrave acircumflex adieresis atilde aring ccedilla eacute egrave ' . - 'ecircumflex edieresis iacute igrave icircumflex idieresis ntilde oacute ' . - 'ograve ocircumflex odieresis otilde uacute ugrave ucircumflex udieresis ' . - 'dagger degree cent sterling section bullet paragraph germandbls ' . - 'registered copyright trademark acute dieresis notequal AE Oslash ' . - 'infinity plusminus lessequal greaterequal yen mu partialdiff summation ' . - 'Pi pi integral ordfeminine ordmasculine Omega ae oslash ' . 
- 'questiondown exclamdown logicalnot radical florin approxequal delta guillemotleft ' . - 'guillemotright ellipsis space Agrave Atilde Otilde OE oe ' . - 'endash emdash quotedblleft quotedblright quoteleft quoteright divide lozenge ' . - 'ydieresis Ydieresis fraction currency guilsinglleft guilsinglright fi fl ' . - 'daggerdbl periodcentered quotesinglbase quotedblbase perthousand Acircumflex Ecircumflex Aacute ' . - 'Edieresis Egrave Iacute Icircumflex Idieresis Igrave Oacute Ocircumflex ' . - 'heart Ograve Uacute Ucircumflex Ugrave dotlessi circumflex tilde ' . + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + 'space exclam quotedbl numbersign dollar percent ampersand quotesingle '. + 'parenleft parenright asterisk plus comma minus period slash '. + 'zero one two three four five six seven '. + 'eight nine colon semicolon less equal greater question '. + 'at A B C D E F G '. + 'H I J K L M N O '. + 'P Q R S T U V W '. + 'X Y Z bracketleft backslash bracketright asciicircum underscore '. + 'grave a b c d e f g '. + 'h i j k l m n o '. + 'p q r s t u v w '. + 'x y z braceleft bar braceright asciitilde .notdef '. + 'Adieresis Aring Ccedilla Eacute Ntilde Odieresis Udieresis aacute '. + 'agrave acircumflex adieresis atilde aring ccedilla eacute egrave '. + 'ecircumflex edieresis iacute igrave icircumflex idieresis ntilde oacute '. + 'ograve ocircumflex odieresis otilde uacute ugrave ucircumflex udieresis '. + 'dagger degree cent sterling section bullet paragraph germandbls '. + 'registered copyright trademark acute dieresis notequal AE Oslash '. + 'infinity plusminus lessequal greaterequal yen mu partialdiff summation '. + 'Pi pi integral ordfeminine ordmasculine Omega ae oslash '. 
+ 'questiondown exclamdown logicalnot radical florin approxequal delta guillemotleft '. + 'guillemotright ellipsis space Agrave Atilde Otilde OE oe '. + 'endash emdash quotedblleft quotedblright quoteleft quoteright divide lozenge '. + 'ydieresis Ydieresis fraction currency guilsinglleft guilsinglright fi fl '. + 'daggerdbl periodcentered quotesinglbase quotedblbase perthousand Acircumflex Ecircumflex Aacute '. + 'Edieresis Egrave Iacute Icircumflex Idieresis Igrave Oacute Ocircumflex '. + 'heart Ograve Uacute Ucircumflex Ugrave dotlessi circumflex tilde '. 'macron breve dotaccent ring cedilla hungarumlaut ogonek caron'; return explode(' ', $encoding); diff --git a/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php b/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php new file mode 100644 index 00000000..70bc48cb --- /dev/null +++ b/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php @@ -0,0 +1,189 @@ + + * + * @date 2023-06-28 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . 
/**
 * Class PDFDocEncoding
 *
 * Byte-to-UTF-8 translation table for PDFDocEncoding, the single-byte
 * encoding used for text strings outside of content streams.
 *
 * Sources:
 *  - https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf
 *  - https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf
 */
class PDFDocEncoding
{
    /**
     * Returns the PDFDocEncoding code page as a byte => UTF-8 string map.
     *
     * Only the bytes that need translating are listed (0x18-0x1f and
     * 0x7f-0xff); every other byte is absent from the map. The bytes 0x7f,
     * 0x9f and 0xad are mapped to the empty string.
     *
     * @return array<string, string> single-byte keys, UTF-8 encoded values
     */
    public static function getCodePage(): array
    {
        return [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            // ring is U+02DA (RING ABOVE); U+02DE is the unrelated
            // "modifier letter rhotic hook".
            "\x1e" => "\u{02da}", // ring
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '',
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '',
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '',
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Each byte listed in the code page is replaced by its UTF-8 sequence
     * (or dropped, for the bytes mapped to an empty string); bytes that are
     * not listed are copied through unchanged. strtr() with an array never
     * re-scans text it has already replaced, so the multi-byte UTF-8 output
     * is not translated a second time.
     *
     * @param string $content raw PDFDocEncoding bytes
     *
     * @return string the translated string
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // static:: (late static binding) is kept so a subclass overriding
        // getCodePage() still has its table used here.
        return strtr($content, static::getCodePage());
    }
}
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Smalot\PdfParser\Encoding; + +/** + * Class PostScriptGlyphs + */ +class PostScriptGlyphs +{ + /** + * The mapping tables have been converted from https://github.com/OpenPrinting/cups-filters/blob/master/fontembed/aglfn13.c, + * part of the OpenPrinting/cups-filters package, which itself is licensed under the MIT license and lists this specific code part as: + * Copyright 2008,2012 Tobias Hoffmann under the Expat license (https://www.gnu.org/licenses/license-list.html#Expat) + */ + public static function getGlyphs(): array + { + return [ + 'space' => '0x00a0', + 'exclam' => '0x0021', + 'quotedbl' => '0x0022', + 'numbersign' => '0x0023', + 'dollar' => '0x0024', + 'percent' => '0x0025', + 'ampersand' => '0x0026', + 'quotesingle' => '0x0027', + 'parenleft' => '0x0028', + 'parenright' => '0x0029', + 'asterisk' => '0x002a', + 'plus' => '0x002b', + 'comma' => '0x002c', + 'hyphen' => '0x002d', + 'period' => '0x002e', + 'slash' => '0x002f', + 'zero' => '0x0030', + 'one' => '0x0031', + 'two' => '0x0032', + 'three' => '0x0033', + 'four' => '0x0034', + 'five' => '0x0035', + 'six' => '0x0036', + 'seven' => '0x0037', + 'eight' => '0x0038', + 'nine' => '0x0039', + 'colon' => '0x003a', + 'semicolon' => '0x003b', + 'less' => '0x003c', + 'equal' => '0x003d', + 'greater' => 
'0x003e', + 'question' => '0x003f', + 'at' => '0x0040', + 'A' => '0x0041', + 'B' => '0x0042', + 'C' => '0x0043', + 'D' => '0x0044', + 'E' => '0x0045', + 'F' => '0x0046', + 'G' => '0x0047', + 'H' => '0x0048', + 'I' => '0x0049', + 'J' => '0x004a', + 'K' => '0x004b', + 'L' => '0x004c', + 'M' => '0x004d', + 'N' => '0x004e', + 'O' => '0x004f', + 'P' => '0x0050', + 'Q' => '0x0051', + 'R' => '0x0052', + 'S' => '0x0053', + 'T' => '0x0054', + 'U' => '0x0055', + 'V' => '0x0056', + 'W' => '0x0057', + 'X' => '0x0058', + 'Y' => '0x0059', + 'Z' => '0x005a', + 'bracketleft' => '0x005b', + 'backslash' => '0x005c', + 'bracketright' => '0x005d', + 'asciicircum' => '0x005e', + 'underscore' => '0x005f', + 'grave' => '0x0060', + 'a' => '0x0061', + 'b' => '0x0062', + 'c' => '0x0063', + 'd' => '0x0064', + 'e' => '0x0065', + 'f' => '0x0066', + 'g' => '0x0067', + 'h' => '0x0068', + 'i' => '0x0069', + 'j' => '0x006a', + 'k' => '0x006b', + 'l' => '0x006c', + 'm' => '0x006d', + 'n' => '0x006e', + 'o' => '0x006f', + 'p' => '0x0070', + 'q' => '0x0071', + 'r' => '0x0072', + 's' => '0x0073', + 't' => '0x0074', + 'u' => '0x0075', + 'v' => '0x0076', + 'w' => '0x0077', + 'x' => '0x0078', + 'y' => '0x0079', + 'z' => '0x007a', + 'braceleft' => '0x007b', + 'bar' => '0x007c', + 'braceright' => '0x007d', + 'asciitilde' => '0x007e', + 'exclamdown' => '0x00a1', + 'cent' => '0x00a2', + 'sterling' => '0x00a3', + 'currency' => '0x00a4', + 'yen' => '0x00a5', + 'brokenbar' => '0x00a6', + 'section' => '0x00a7', + 'dieresis' => '0x00a8', + 'copyright' => '0x00a9', + 'ordfeminine' => '0x00aa', + 'guillemotleft' => '0x00ab', + 'logicalnot' => '0x00ac', + 'minus' => '0x2212', + 'registered' => '0x00ae', + 'macron' => '0x02c9', + 'degree' => '0x00b0', + 'plusminus' => '0x00b1', + 'twosuperior' => '0x00b2', + 'threesuperior' => '0x00b3', + 'acute' => '0x00b4', + 'mu' => '0x03bc', + 'paragraph' => '0x00b6', + 'periodcentered' => '0x2219', + 'cedilla' => '0x00b8', + 'onesuperior' => '0x00b9', + 'ordmasculine' => 
'0x00ba', + 'guillemotright' => '0x00bb', + 'onequarter' => '0x00bc', + 'onehalf' => '0x00bd', + 'threequarters' => '0x00be', + 'questiondown' => '0x00bf', + 'Agrave' => '0x00c0', + 'Aacute' => '0x00c1', + 'Acircumflex' => '0x00c2', + 'Atilde' => '0x00c3', + 'Adieresis' => '0x00c4', + 'Aring' => '0x00c5', + 'AE' => '0x00c6', + 'Ccedilla' => '0x00c7', + 'Egrave' => '0x00c8', + 'Eacute' => '0x00c9', + 'Ecircumflex' => '0x00ca', + 'Edieresis' => '0x00cb', + 'Igrave' => '0x00cc', + 'Iacute' => '0x00cd', + 'Icircumflex' => '0x00ce', + 'Idieresis' => '0x00cf', + 'Eth' => '0x00d0', + 'Ntilde' => '0x00d1', + 'Ograve' => '0x00d2', + 'Oacute' => '0x00d3', + 'Ocircumflex' => '0x00d4', + 'Otilde' => '0x00d5', + 'Odieresis' => '0x00d6', + 'multiply' => '0x00d7', + 'Oslash' => '0x00d8', + 'Ugrave' => '0x00d9', + 'Uacute' => '0x00da', + 'Ucircumflex' => '0x00db', + 'Udieresis' => '0x00dc', + 'Yacute' => '0x00dd', + 'Thorn' => '0x00de', + 'germandbls' => '0x00df', + 'agrave' => '0x00e0', + 'aacute' => '0x00e1', + 'acircumflex' => '0x00e2', + 'atilde' => '0x00e3', + 'adieresis' => '0x00e4', + 'aring' => '0x00e5', + 'ae' => '0x00e6', + 'ccedilla' => '0x00e7', + 'egrave' => '0x00e8', + 'eacute' => '0x00e9', + 'ecircumflex' => '0x00ea', + 'edieresis' => '0x00eb', + 'igrave' => '0x00ec', + 'iacute' => '0x00ed', + 'icircumflex' => '0x00ee', + 'idieresis' => '0x00ef', + 'eth' => '0x00f0', + 'ntilde' => '0x00f1', + 'ograve' => '0x00f2', + 'oacute' => '0x00f3', + 'ocircumflex' => '0x00f4', + 'otilde' => '0x00f5', + 'odieresis' => '0x00f6', + 'divide' => '0x00f7', + 'oslash' => '0x00f8', + 'ugrave' => '0x00f9', + 'uacute' => '0x00fa', + 'ucircumflex' => '0x00fb', + 'udieresis' => '0x00fc', + 'yacute' => '0x00fd', + 'thorn' => '0x00fe', + 'ydieresis' => '0x00ff', + 'Amacron' => '0x0100', + 'amacron' => '0x0101', + 'Abreve' => '0x0102', + 'abreve' => '0x0103', + 'Aogonek' => '0x0104', + 'aogonek' => '0x0105', + 'Cacute' => '0x0106', + 'cacute' => '0x0107', + 'Ccircumflex' => '0x0108', + 
'ccircumflex' => '0x0109', + 'Cdotaccent' => '0x010a', + 'cdotaccent' => '0x010b', + 'Ccaron' => '0x010c', + 'ccaron' => '0x010d', + 'Dcaron' => '0x010e', + 'dcaron' => '0x010f', + 'Dcroat' => '0x0110', + 'dcroat' => '0x0111', + 'Emacron' => '0x0112', + 'emacron' => '0x0113', + 'Ebreve' => '0x0114', + 'ebreve' => '0x0115', + 'Edotaccent' => '0x0116', + 'edotaccent' => '0x0117', + 'Eogonek' => '0x0118', + 'eogonek' => '0x0119', + 'Ecaron' => '0x011a', + 'ecaron' => '0x011b', + 'Gcircumflex' => '0x011c', + 'gcircumflex' => '0x011d', + 'Gbreve' => '0x011e', + 'gbreve' => '0x011f', + 'Gdotaccent' => '0x0120', + 'gdotaccent' => '0x0121', + 'Gcommaaccent' => '0x0122', + 'gcommaaccent' => '0x0123', + 'Hcircumflex' => '0x0124', + 'hcircumflex' => '0x0125', + 'Hbar' => '0x0126', + 'hbar' => '0x0127', + 'Itilde' => '0x0128', + 'itilde' => '0x0129', + 'Imacron' => '0x012a', + 'imacron' => '0x012b', + 'Ibreve' => '0x012c', + 'ibreve' => '0x012d', + 'Iogonek' => '0x012e', + 'iogonek' => '0x012f', + 'Idotaccent' => '0x0130', + 'dotlessi' => '0x0131', + 'IJ' => '0x0132', + 'ij' => '0x0133', + 'Jcircumflex' => '0x0134', + 'jcircumflex' => '0x0135', + 'Kcommaaccent' => '0x0136', + 'kcommaaccent' => '0x0137', + 'kgreenlandic' => '0x0138', + 'Lacute' => '0x0139', + 'lacute' => '0x013a', + 'Lcommaaccent' => '0x013b', + 'lcommaaccent' => '0x013c', + 'Lcaron' => '0x013d', + 'lcaron' => '0x013e', + 'Ldot' => '0x013f', + 'ldot' => '0x0140', + 'Lslash' => '0x0141', + 'lslash' => '0x0142', + 'Nacute' => '0x0143', + 'nacute' => '0x0144', + 'Ncommaaccent' => '0x0145', + 'ncommaaccent' => '0x0146', + 'Ncaron' => '0x0147', + 'ncaron' => '0x0148', + 'napostrophe' => '0x0149', + 'Eng' => '0x014a', + 'eng' => '0x014b', + 'Omacron' => '0x014c', + 'omacron' => '0x014d', + 'Obreve' => '0x014e', + 'obreve' => '0x014f', + 'Ohungarumlaut' => '0x0150', + 'ohungarumlaut' => '0x0151', + 'OE' => '0x0152', + 'oe' => '0x0153', + 'Racute' => '0x0154', + 'racute' => '0x0155', + 'Rcommaaccent' => '0x0156', + 
'rcommaaccent' => '0x0157', + 'Rcaron' => '0x0158', + 'rcaron' => '0x0159', + 'Sacute' => '0x015a', + 'sacute' => '0x015b', + 'Scircumflex' => '0x015c', + 'scircumflex' => '0x015d', + 'Scedilla' => '0xf6c1', + 'scedilla' => '0xf6c2', + 'Scaron' => '0x0160', + 'scaron' => '0x0161', + 'Tcommaaccent' => '0x021a', + 'tcommaaccent' => '0x021b', + 'Tcaron' => '0x0164', + 'tcaron' => '0x0165', + 'Tbar' => '0x0166', + 'tbar' => '0x0167', + 'Utilde' => '0x0168', + 'utilde' => '0x0169', + 'Umacron' => '0x016a', + 'umacron' => '0x016b', + 'Ubreve' => '0x016c', + 'ubreve' => '0x016d', + 'Uring' => '0x016e', + 'uring' => '0x016f', + 'Uhungarumlaut' => '0x0170', + 'uhungarumlaut' => '0x0171', + 'Uogonek' => '0x0172', + 'uogonek' => '0x0173', + 'Wcircumflex' => '0x0174', + 'wcircumflex' => '0x0175', + 'Ycircumflex' => '0x0176', + 'ycircumflex' => '0x0177', + 'Ydieresis' => '0x0178', + 'Zacute' => '0x0179', + 'zacute' => '0x017a', + 'Zdotaccent' => '0x017b', + 'zdotaccent' => '0x017c', + 'Zcaron' => '0x017d', + 'zcaron' => '0x017e', + 'longs' => '0x017f', + 'florin' => '0x0192', + 'Ohorn' => '0x01a0', + 'ohorn' => '0x01a1', + 'Uhorn' => '0x01af', + 'uhorn' => '0x01b0', + 'Gcaron' => '0x01e6', + 'gcaron' => '0x01e7', + 'Aringacute' => '0x01fa', + 'aringacute' => '0x01fb', + 'AEacute' => '0x01fc', + 'aeacute' => '0x01fd', + 'Oslashacute' => '0x01fe', + 'oslashacute' => '0x01ff', + 'Scommaaccent' => '0x0218', + 'scommaaccent' => '0x0219', + 'afii57929' => '0x02bc', + 'afii64937' => '0x02bd', + 'circumflex' => '0x02c6', + 'caron' => '0x02c7', + 'breve' => '0x02d8', + 'dotaccent' => '0x02d9', + 'ring' => '0x02da', + 'ogonek' => '0x02db', + 'tilde' => '0x02dc', + 'hungarumlaut' => '0x02dd', + 'gravecomb' => '0x0300', + 'acutecomb' => '0x0301', + 'tildecomb' => '0x0303', + 'hookabovecomb' => '0x0309', + 'dotbelowcomb' => '0x0323', + 'tonos' => '0x0384', + 'dieresistonos' => '0x0385', + 'Alphatonos' => '0x0386', + 'anoteleia' => '0x0387', + 'Epsilontonos' => '0x0388', + 'Etatonos' => 
'0x0389', + 'Iotatonos' => '0x038a', + 'Omicrontonos' => '0x038c', + 'Upsilontonos' => '0x038e', + 'Omegatonos' => '0x038f', + 'iotadieresistonos' => '0x0390', + 'Alpha' => '0x0391', + 'Beta' => '0x0392', + 'Gamma' => '0x0393', + 'Delta' => '0x2206', + 'Epsilon' => '0x0395', + 'Zeta' => '0x0396', + 'Eta' => '0x0397', + 'Theta' => '0x0398', + 'Iota' => '0x0399', + 'Kappa' => '0x039a', + 'Lambda' => '0x039b', + 'Mu' => '0x039c', + 'Nu' => '0x039d', + 'Xi' => '0x039e', + 'Omicron' => '0x039f', + 'Pi' => '0x03a0', + 'Rho' => '0x03a1', + 'Sigma' => '0x03a3', + 'Tau' => '0x03a4', + 'Upsilon' => '0x03a5', + 'Phi' => '0x03a6', + 'Chi' => '0x03a7', + 'Psi' => '0x03a8', + 'Omega' => '0x2126', + 'Iotadieresis' => '0x03aa', + 'Upsilondieresis' => '0x03ab', + 'alphatonos' => '0x03ac', + 'epsilontonos' => '0x03ad', + 'etatonos' => '0x03ae', + 'iotatonos' => '0x03af', + 'upsilondieresistonos' => '0x03b0', + 'alpha' => '0x03b1', + 'beta' => '0x03b2', + 'gamma' => '0x03b3', + 'delta' => '0x03b4', + 'epsilon' => '0x03b5', + 'zeta' => '0x03b6', + 'eta' => '0x03b7', + 'theta' => '0x03b8', + 'iota' => '0x03b9', + 'kappa' => '0x03ba', + 'lambda' => '0x03bb', + 'nu' => '0x03bd', + 'xi' => '0x03be', + 'omicron' => '0x03bf', + 'pi' => '0x03c0', + 'rho' => '0x03c1', + 'sigma1' => '0x03c2', + 'sigma' => '0x03c3', + 'tau' => '0x03c4', + 'upsilon' => '0x03c5', + 'phi' => '0x03c6', + 'chi' => '0x03c7', + 'psi' => '0x03c8', + 'omega' => '0x03c9', + 'iotadieresis' => '0x03ca', + 'upsilondieresis' => '0x03cb', + 'omicrontonos' => '0x03cc', + 'upsilontonos' => '0x03cd', + 'omegatonos' => '0x03ce', + 'theta1' => '0x03d1', + 'Upsilon1' => '0x03d2', + 'phi1' => '0x03d5', + 'omega1' => '0x03d6', + 'afii10023' => '0x0401', + 'afii10051' => '0x0402', + 'afii10052' => '0x0403', + 'afii10053' => '0x0404', + 'afii10054' => '0x0405', + 'afii10055' => '0x0406', + 'afii10056' => '0x0407', + 'afii10057' => '0x0408', + 'afii10058' => '0x0409', + 'afii10059' => '0x040a', + 'afii10060' => '0x040b', + 'afii10061' 
=> '0x040c', + 'afii10062' => '0x040e', + 'afii10145' => '0x040f', + 'afii10017' => '0x0410', + 'afii10018' => '0x0411', + 'afii10019' => '0x0412', + 'afii10020' => '0x0413', + 'afii10021' => '0x0414', + 'afii10022' => '0x0415', + 'afii10024' => '0x0416', + 'afii10025' => '0x0417', + 'afii10026' => '0x0418', + 'afii10027' => '0x0419', + 'afii10028' => '0x041a', + 'afii10029' => '0x041b', + 'afii10030' => '0x041c', + 'afii10031' => '0x041d', + 'afii10032' => '0x041e', + 'afii10033' => '0x041f', + 'afii10034' => '0x0420', + 'afii10035' => '0x0421', + 'afii10036' => '0x0422', + 'afii10037' => '0x0423', + 'afii10038' => '0x0424', + 'afii10039' => '0x0425', + 'afii10040' => '0x0426', + 'afii10041' => '0x0427', + 'afii10042' => '0x0428', + 'afii10043' => '0x0429', + 'afii10044' => '0x042a', + 'afii10045' => '0x042b', + 'afii10046' => '0x042c', + 'afii10047' => '0x042d', + 'afii10048' => '0x042e', + 'afii10049' => '0x042f', + 'afii10065' => '0x0430', + 'afii10066' => '0x0431', + 'afii10067' => '0x0432', + 'afii10068' => '0x0433', + 'afii10069' => '0x0434', + 'afii10070' => '0x0435', + 'afii10072' => '0x0436', + 'afii10073' => '0x0437', + 'afii10074' => '0x0438', + 'afii10075' => '0x0439', + 'afii10076' => '0x043a', + 'afii10077' => '0x043b', + 'afii10078' => '0x043c', + 'afii10079' => '0x043d', + 'afii10080' => '0x043e', + 'afii10081' => '0x043f', + 'afii10082' => '0x0440', + 'afii10083' => '0x0441', + 'afii10084' => '0x0442', + 'afii10085' => '0x0443', + 'afii10086' => '0x0444', + 'afii10087' => '0x0445', + 'afii10088' => '0x0446', + 'afii10089' => '0x0447', + 'afii10090' => '0x0448', + 'afii10091' => '0x0449', + 'afii10092' => '0x044a', + 'afii10093' => '0x044b', + 'afii10094' => '0x044c', + 'afii10095' => '0x044d', + 'afii10096' => '0x044e', + 'afii10097' => '0x044f', + 'afii10071' => '0x0451', + 'afii10099' => '0x0452', + 'afii10100' => '0x0453', + 'afii10101' => '0x0454', + 'afii10102' => '0x0455', + 'afii10103' => '0x0456', + 'afii10104' => '0x0457', + 'afii10105' 
=> '0x0458', + 'afii10106' => '0x0459', + 'afii10107' => '0x045a', + 'afii10108' => '0x045b', + 'afii10109' => '0x045c', + 'afii10110' => '0x045e', + 'afii10193' => '0x045f', + 'afii10146' => '0x0462', + 'afii10194' => '0x0463', + 'afii10147' => '0x0472', + 'afii10195' => '0x0473', + 'afii10148' => '0x0474', + 'afii10196' => '0x0475', + 'afii10050' => '0x0490', + 'afii10098' => '0x0491', + 'afii10846' => '0x04d9', + 'afii57799' => '0x05b0', + 'afii57801' => '0x05b1', + 'afii57800' => '0x05b2', + 'afii57802' => '0x05b3', + 'afii57793' => '0x05b4', + 'afii57794' => '0x05b5', + 'afii57795' => '0x05b6', + 'afii57798' => '0x05b7', + 'afii57797' => '0x05b8', + 'afii57806' => '0x05b9', + 'afii57796' => '0x05bb', + 'afii57807' => '0x05bc', + 'afii57839' => '0x05bd', + 'afii57645' => '0x05be', + 'afii57841' => '0x05bf', + 'afii57842' => '0x05c0', + 'afii57804' => '0x05c1', + 'afii57803' => '0x05c2', + 'afii57658' => '0x05c3', + 'afii57664' => '0x05d0', + 'afii57665' => '0x05d1', + 'afii57666' => '0x05d2', + 'afii57667' => '0x05d3', + 'afii57668' => '0x05d4', + 'afii57669' => '0x05d5', + 'afii57670' => '0x05d6', + 'afii57671' => '0x05d7', + 'afii57672' => '0x05d8', + 'afii57673' => '0x05d9', + 'afii57674' => '0x05da', + 'afii57675' => '0x05db', + 'afii57676' => '0x05dc', + 'afii57677' => '0x05dd', + 'afii57678' => '0x05de', + 'afii57679' => '0x05df', + 'afii57680' => '0x05e0', + 'afii57681' => '0x05e1', + 'afii57682' => '0x05e2', + 'afii57683' => '0x05e3', + 'afii57684' => '0x05e4', + 'afii57685' => '0x05e5', + 'afii57686' => '0x05e6', + 'afii57687' => '0x05e7', + 'afii57688' => '0x05e8', + 'afii57689' => '0x05e9', + 'afii57690' => '0x05ea', + 'afii57716' => '0x05f0', + 'afii57717' => '0x05f1', + 'afii57718' => '0x05f2', + 'afii57388' => '0x060c', + 'afii57403' => '0x061b', + 'afii57407' => '0x061f', + 'afii57409' => '0x0621', + 'afii57410' => '0x0622', + 'afii57411' => '0x0623', + 'afii57412' => '0x0624', + 'afii57413' => '0x0625', + 'afii57414' => '0x0626', + 'afii57415' 
=> '0x0627', + 'afii57416' => '0x0628', + 'afii57417' => '0x0629', + 'afii57418' => '0x062a', + 'afii57419' => '0x062b', + 'afii57420' => '0x062c', + 'afii57421' => '0x062d', + 'afii57422' => '0x062e', + 'afii57423' => '0x062f', + 'afii57424' => '0x0630', + 'afii57425' => '0x0631', + 'afii57426' => '0x0632', + 'afii57427' => '0x0633', + 'afii57428' => '0x0634', + 'afii57429' => '0x0635', + 'afii57430' => '0x0636', + 'afii57431' => '0x0637', + 'afii57432' => '0x0638', + 'afii57433' => '0x0639', + 'afii57434' => '0x063a', + 'afii57440' => '0x0640', + 'afii57441' => '0x0641', + 'afii57442' => '0x0642', + 'afii57443' => '0x0643', + 'afii57444' => '0x0644', + 'afii57445' => '0x0645', + 'afii57446' => '0x0646', + 'afii57470' => '0x0647', + 'afii57448' => '0x0648', + 'afii57449' => '0x0649', + 'afii57450' => '0x064a', + 'afii57451' => '0x064b', + 'afii57452' => '0x064c', + 'afii57453' => '0x064d', + 'afii57454' => '0x064e', + 'afii57455' => '0x064f', + 'afii57456' => '0x0650', + 'afii57457' => '0x0651', + 'afii57458' => '0x0652', + 'afii57392' => '0x0660', + 'afii57393' => '0x0661', + 'afii57394' => '0x0662', + 'afii57395' => '0x0663', + 'afii57396' => '0x0664', + 'afii57397' => '0x0665', + 'afii57398' => '0x0666', + 'afii57399' => '0x0667', + 'afii57400' => '0x0668', + 'afii57401' => '0x0669', + 'afii57381' => '0x066a', + 'afii63167' => '0x066d', + 'afii57511' => '0x0679', + 'afii57506' => '0x067e', + 'afii57507' => '0x0686', + 'afii57512' => '0x0688', + 'afii57513' => '0x0691', + 'afii57508' => '0x0698', + 'afii57505' => '0x06a4', + 'afii57509' => '0x06af', + 'afii57514' => '0x06ba', + 'afii57519' => '0x06d2', + 'afii57534' => '0x06d5', + 'Wgrave' => '0x1e80', + 'wgrave' => '0x1e81', + 'Wacute' => '0x1e82', + 'wacute' => '0x1e83', + 'Wdieresis' => '0x1e84', + 'wdieresis' => '0x1e85', + 'Ygrave' => '0x1ef2', + 'ygrave' => '0x1ef3', + 'afii61664' => '0x200c', + 'afii301' => '0x200d', + 'afii299' => '0x200e', + 'afii300' => '0x200f', + 'figuredash' => '0x2012', + 'endash' 
=> '0x2013', + 'emdash' => '0x2014', + 'afii00208' => '0x2015', + 'underscoredbl' => '0x2017', + 'quoteleft' => '0x2018', + 'quoteright' => '0x2019', + 'quotesinglbase' => '0x201a', + 'quotereversed' => '0x201b', + 'quotedblleft' => '0x201c', + 'quotedblright' => '0x201d', + 'quotedblbase' => '0x201e', + 'dagger' => '0x2020', + 'daggerdbl' => '0x2021', + 'bullet' => '0x2022', + 'onedotenleader' => '0x2024', + 'twodotenleader' => '0x2025', + 'ellipsis' => '0x2026', + 'afii61573' => '0x202c', + 'afii61574' => '0x202d', + 'afii61575' => '0x202e', + 'perthousand' => '0x2030', + 'minute' => '0x2032', + 'second' => '0x2033', + 'guilsinglleft' => '0x2039', + 'guilsinglright' => '0x203a', + 'exclamdbl' => '0x203c', + 'fraction' => '0x2215', + 'zerosuperior' => '0x2070', + 'foursuperior' => '0x2074', + 'fivesuperior' => '0x2075', + 'sixsuperior' => '0x2076', + 'sevensuperior' => '0x2077', + 'eightsuperior' => '0x2078', + 'ninesuperior' => '0x2079', + 'parenleftsuperior' => '0x207d', + 'parenrightsuperior' => '0x207e', + 'nsuperior' => '0x207f', + 'zeroinferior' => '0x2080', + 'oneinferior' => '0x2081', + 'twoinferior' => '0x2082', + 'threeinferior' => '0x2083', + 'fourinferior' => '0x2084', + 'fiveinferior' => '0x2085', + 'sixinferior' => '0x2086', + 'seveninferior' => '0x2087', + 'eightinferior' => '0x2088', + 'nineinferior' => '0x2089', + 'parenleftinferior' => '0x208d', + 'parenrightinferior' => '0x208e', + 'colonmonetary' => '0x20a1', + 'franc' => '0x20a3', + 'lira' => '0x20a4', + 'peseta' => '0x20a7', + 'afii57636' => '0x20aa', + 'dong' => '0x20ab', + 'Euro' => '0x20ac', + 'afii61248' => '0x2105', + 'Ifraktur' => '0x2111', + 'afii61289' => '0x2113', + 'afii61352' => '0x2116', + 'weierstrass' => '0x2118', + 'Rfraktur' => '0x211c', + 'prescription' => '0x211e', + 'trademark' => '0x2122', + 'estimated' => '0x212e', + 'aleph' => '0x2135', + 'onethird' => '0x2153', + 'twothirds' => '0x2154', + 'oneeighth' => '0x215b', + 'threeeighths' => '0x215c', + 'fiveeighths' => 
'0x215d', + 'seveneighths' => '0x215e', + 'arrowleft' => '0x2190', + 'arrowup' => '0x2191', + 'arrowright' => '0x2192', + 'arrowdown' => '0x2193', + 'arrowboth' => '0x2194', + 'arrowupdn' => '0x2195', + 'arrowupdnbse' => '0x21a8', + 'carriagereturn' => '0x21b5', + 'arrowdblleft' => '0x21d0', + 'arrowdblup' => '0x21d1', + 'arrowdblright' => '0x21d2', + 'arrowdbldown' => '0x21d3', + 'arrowdblboth' => '0x21d4', + 'universal' => '0x2200', + 'partialdiff' => '0x2202', + 'existential' => '0x2203', + 'emptyset' => '0x2205', + 'gradient' => '0x2207', + 'element' => '0x2208', + 'notelement' => '0x2209', + 'suchthat' => '0x220b', + 'product' => '0x220f', + 'summation' => '0x2211', + 'asteriskmath' => '0x2217', + 'radical' => '0x221a', + 'proportional' => '0x221d', + 'infinity' => '0x221e', + 'orthogonal' => '0x221f', + 'angle' => '0x2220', + 'logicaland' => '0x2227', + 'logicalor' => '0x2228', + 'intersection' => '0x2229', + 'union' => '0x222a', + 'integral' => '0x222b', + 'therefore' => '0x2234', + 'similar' => '0x223c', + 'congruent' => '0x2245', + 'approxequal' => '0x2248', + 'notequal' => '0x2260', + 'equivalence' => '0x2261', + 'lessequal' => '0x2264', + 'greaterequal' => '0x2265', + 'propersubset' => '0x2282', + 'propersuperset' => '0x2283', + 'notsubset' => '0x2284', + 'reflexsubset' => '0x2286', + 'reflexsuperset' => '0x2287', + 'circleplus' => '0x2295', + 'circlemultiply' => '0x2297', + 'perpendicular' => '0x22a5', + 'dotmath' => '0x22c5', + 'house' => '0x2302', + 'revlogicalnot' => '0x2310', + 'integraltp' => '0x2320', + 'integralbt' => '0x2321', + 'angleleft' => '0x2329', + 'angleright' => '0x232a', + 'SF100000' => '0x2500', + 'SF110000' => '0x2502', + 'SF010000' => '0x250c', + 'SF030000' => '0x2510', + 'SF020000' => '0x2514', + 'SF040000' => '0x2518', + 'SF080000' => '0x251c', + 'SF090000' => '0x2524', + 'SF060000' => '0x252c', + 'SF070000' => '0x2534', + 'SF050000' => '0x253c', + 'SF430000' => '0x2550', + 'SF240000' => '0x2551', + 'SF510000' => '0x2552', + 
'SF520000' => '0x2553', + 'SF390000' => '0x2554', + 'SF220000' => '0x2555', + 'SF210000' => '0x2556', + 'SF250000' => '0x2557', + 'SF500000' => '0x2558', + 'SF490000' => '0x2559', + 'SF380000' => '0x255a', + 'SF280000' => '0x255b', + 'SF270000' => '0x255c', + 'SF260000' => '0x255d', + 'SF360000' => '0x255e', + 'SF370000' => '0x255f', + 'SF420000' => '0x2560', + 'SF190000' => '0x2561', + 'SF200000' => '0x2562', + 'SF230000' => '0x2563', + 'SF470000' => '0x2564', + 'SF480000' => '0x2565', + 'SF410000' => '0x2566', + 'SF450000' => '0x2567', + 'SF460000' => '0x2568', + 'SF400000' => '0x2569', + 'SF540000' => '0x256a', + 'SF530000' => '0x256b', + 'SF440000' => '0x256c', + 'upblock' => '0x2580', + 'dnblock' => '0x2584', + 'block' => '0x2588', + 'lfblock' => '0x258c', + 'rtblock' => '0x2590', + 'ltshade' => '0x2591', + 'shade' => '0x2592', + 'dkshade' => '0x2593', + 'filledbox' => '0x25a0', + 'H22073' => '0x25a1', + 'H18543' => '0x25aa', + 'H18551' => '0x25ab', + 'filledrect' => '0x25ac', + 'triagup' => '0x25b2', + 'triagrt' => '0x25ba', + 'triagdn' => '0x25bc', + 'triaglf' => '0x25c4', + 'lozenge' => '0x25ca', + 'circle' => '0x25cb', + 'H18533' => '0x25cf', + 'invbullet' => '0x25d8', + 'invcircle' => '0x25d9', + 'openbullet' => '0x25e6', + 'smileface' => '0x263a', + 'invsmileface' => '0x263b', + 'sun' => '0x263c', + 'female' => '0x2640', + 'male' => '0x2642', + 'spade' => '0x2660', + 'club' => '0x2663', + 'heart' => '0x2665', + 'diamond' => '0x2666', + 'musicalnote' => '0x266a', + 'musicalnotedbl' => '0x266b', + 'dotlessj' => '0xf6be', + 'LL' => '0xf6bf', + 'll' => '0xf6c0', + 'commaaccent' => '0xf6c3', + 'afii10063' => '0xf6c4', + 'afii10064' => '0xf6c5', + 'afii10192' => '0xf6c6', + 'afii10831' => '0xf6c7', + 'afii10832' => '0xf6c8', + 'Acute' => '0xf6c9', + 'Caron' => '0xf6ca', + 'Dieresis' => '0xf6cb', + 'DieresisAcute' => '0xf6cc', + 'DieresisGrave' => '0xf6cd', + 'Grave' => '0xf6ce', + 'Hungarumlaut' => '0xf6cf', + 'Macron' => '0xf6d0', + 'cyrBreve' => '0xf6d1', + 
'cyrFlex' => '0xf6d2', + 'dblGrave' => '0xf6d3', + 'cyrbreve' => '0xf6d4', + 'cyrflex' => '0xf6d5', + 'dblgrave' => '0xf6d6', + 'dieresisacute' => '0xf6d7', + 'dieresisgrave' => '0xf6d8', + 'copyrightserif' => '0xf6d9', + 'registerserif' => '0xf6da', + 'trademarkserif' => '0xf6db', + 'onefitted' => '0xf6dc', + 'rupiah' => '0xf6dd', + 'threequartersemdash' => '0xf6de', + 'centinferior' => '0xf6df', + 'centsuperior' => '0xf6e0', + 'commainferior' => '0xf6e1', + 'commasuperior' => '0xf6e2', + 'dollarinferior' => '0xf6e3', + 'dollarsuperior' => '0xf6e4', + 'hypheninferior' => '0xf6e5', + 'hyphensuperior' => '0xf6e6', + 'periodinferior' => '0xf6e7', + 'periodsuperior' => '0xf6e8', + 'asuperior' => '0xf6e9', + 'bsuperior' => '0xf6ea', + 'dsuperior' => '0xf6eb', + 'esuperior' => '0xf6ec', + 'isuperior' => '0xf6ed', + 'lsuperior' => '0xf6ee', + 'msuperior' => '0xf6ef', + 'osuperior' => '0xf6f0', + 'rsuperior' => '0xf6f1', + 'ssuperior' => '0xf6f2', + 'tsuperior' => '0xf6f3', + 'Brevesmall' => '0xf6f4', + 'Caronsmall' => '0xf6f5', + 'Circumflexsmall' => '0xf6f6', + 'Dotaccentsmall' => '0xf6f7', + 'Hungarumlautsmall' => '0xf6f8', + 'Lslashsmall' => '0xf6f9', + 'OEsmall' => '0xf6fa', + 'Ogoneksmall' => '0xf6fb', + 'Ringsmall' => '0xf6fc', + 'Scaronsmall' => '0xf6fd', + 'Tildesmall' => '0xf6fe', + 'Zcaronsmall' => '0xf6ff', + 'exclamsmall' => '0xf721', + 'dollaroldstyle' => '0xf724', + 'ampersandsmall' => '0xf726', + 'zerooldstyle' => '0xf730', + 'oneoldstyle' => '0xf731', + 'twooldstyle' => '0xf732', + 'threeoldstyle' => '0xf733', + 'fouroldstyle' => '0xf734', + 'fiveoldstyle' => '0xf735', + 'sixoldstyle' => '0xf736', + 'sevenoldstyle' => '0xf737', + 'eightoldstyle' => '0xf738', + 'nineoldstyle' => '0xf739', + 'questionsmall' => '0xf73f', + 'Gravesmall' => '0xf760', + 'Asmall' => '0xf761', + 'Bsmall' => '0xf762', + 'Csmall' => '0xf763', + 'Dsmall' => '0xf764', + 'Esmall' => '0xf765', + 'Fsmall' => '0xf766', + 'Gsmall' => '0xf767', + 'Hsmall' => '0xf768', + 'Ismall' => 
'0xf769', + 'Jsmall' => '0xf76a', + 'Ksmall' => '0xf76b', + 'Lsmall' => '0xf76c', + 'Msmall' => '0xf76d', + 'Nsmall' => '0xf76e', + 'Osmall' => '0xf76f', + 'Psmall' => '0xf770', + 'Qsmall' => '0xf771', + 'Rsmall' => '0xf772', + 'Ssmall' => '0xf773', + 'Tsmall' => '0xf774', + 'Usmall' => '0xf775', + 'Vsmall' => '0xf776', + 'Wsmall' => '0xf777', + 'Xsmall' => '0xf778', + 'Ysmall' => '0xf779', + 'Zsmall' => '0xf77a', + 'exclamdownsmall' => '0xf7a1', + 'centoldstyle' => '0xf7a2', + 'Dieresissmall' => '0xf7a8', + 'Macronsmall' => '0xf7af', + 'Acutesmall' => '0xf7b4', + 'Cedillasmall' => '0xf7b8', + 'questiondownsmall' => '0xf7bf', + 'Agravesmall' => '0xf7e0', + 'Aacutesmall' => '0xf7e1', + 'Acircumflexsmall' => '0xf7e2', + 'Atildesmall' => '0xf7e3', + 'Adieresissmall' => '0xf7e4', + 'Aringsmall' => '0xf7e5', + 'AEsmall' => '0xf7e6', + 'Ccedillasmall' => '0xf7e7', + 'Egravesmall' => '0xf7e8', + 'Eacutesmall' => '0xf7e9', + 'Ecircumflexsmall' => '0xf7ea', + 'Edieresissmall' => '0xf7eb', + 'Igravesmall' => '0xf7ec', + 'Iacutesmall' => '0xf7ed', + 'Icircumflexsmall' => '0xf7ee', + 'Idieresissmall' => '0xf7ef', + 'Ethsmall' => '0xf7f0', + 'Ntildesmall' => '0xf7f1', + 'Ogravesmall' => '0xf7f2', + 'Oacutesmall' => '0xf7f3', + 'Ocircumflexsmall' => '0xf7f4', + 'Otildesmall' => '0xf7f5', + 'Odieresissmall' => '0xf7f6', + 'Oslashsmall' => '0xf7f8', + 'Ugravesmall' => '0xf7f9', + 'Uacutesmall' => '0xf7fa', + 'Ucircumflexsmall' => '0xf7fb', + 'Udieresissmall' => '0xf7fc', + 'Yacutesmall' => '0xf7fd', + 'Thornsmall' => '0xf7fe', + 'Ydieresissmall' => '0xf7ff', + 'radicalex' => '0xf8e5', + 'arrowvertex' => '0xf8e6', + 'arrowhorizex' => '0xf8e7', + 'registersans' => '0xf8e8', + 'copyrightsans' => '0xf8e9', + 'trademarksans' => '0xf8ea', + 'parenlefttp' => '0xf8eb', + 'parenleftex' => '0xf8ec', + 'parenleftbt' => '0xf8ed', + 'bracketlefttp' => '0xf8ee', + 'bracketleftex' => '0xf8ef', + 'bracketleftbt' => '0xf8f0', + 'bracelefttp' => '0xf8f1', + 'braceleftmid' => '0xf8f2', + 
'braceleftbt' => '0xf8f3', + 'braceex' => '0xf8f4', + 'integralex' => '0xf8f5', + 'parenrighttp' => '0xf8f6', + 'parenrightex' => '0xf8f7', + 'parenrightbt' => '0xf8f8', + 'bracketrighttp' => '0xf8f9', + 'bracketrightex' => '0xf8fa', + 'bracketrightbt' => '0xf8fb', + 'bracerighttp' => '0xf8fc', + 'bracerightmid' => '0xf8fd', + 'bracerightbt' => '0xf8fe', + 'ff' => '0xfb00', + 'fi' => '0xfb01', + 'fl' => '0xfb02', + 'ffi' => '0xfb03', + 'ffl' => '0xfb04', + 'afii57705' => '0xfb1f', + 'afii57694' => '0xfb2a', + 'afii57695' => '0xfb2b', + 'afii57723' => '0xfb35', + 'afii57700' => '0xfb4b', + ]; + } + + public static function getCodePoint($glyph): ?int + { + $glyphsMap = static::getGlyphs(); + + if (isset($glyphsMap[$glyph])) { + return hexdec($glyphsMap[$glyph]); + } + + return null; + } +} diff --git a/src/Smalot/PdfParser/Encoding/StandardEncoding.php b/src/Smalot/PdfParser/Encoding/StandardEncoding.php index c02db6b6..01d0a1c4 100644 --- a/src/Smalot/PdfParser/Encoding/StandardEncoding.php +++ b/src/Smalot/PdfParser/Encoding/StandardEncoding.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
+ * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ // Source : http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm @@ -34,41 +36,39 @@ /** * Class StandardEncoding - * - * @package Smalot\PdfParser\Encoding */ -class StandardEncoding +class StandardEncoding extends AbstractEncoding { - public function getTranslations() + public function getTranslations(): array { $encoding = - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - 'space exclam quotedbl numbersign dollar percent ampersand quoteright ' . - 'parenleft parenright asterisk plus comma hyphen period slash zero ' . - 'one two three four five six seven eight nine colon semicolon less ' . - 'equal greater question at A B C D E F G H I J K L M N O P Q R S T U ' . - 'V W X Y Z bracketleft backslash bracketright asciicircum underscore ' . - 'quoteleft a b c d e f g h i j k l m n o p q r s t u v w x y z ' . - 'braceleft bar braceright asciitilde .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef exclamdown cent ' . - 'sterling fraction yen florin section currency quotesingle ' . - 'quotedblleft guillemotleft guilsinglleft guilsinglright fi fl ' . - '.notdef endash dagger daggerdbl periodcentered .notdef paragraph ' . - 'bullet quotesinglbase quotedblbase quotedblright guillemotright ' . 
- 'ellipsis perthousand .notdef questiondown .notdef grave acute ' . - 'circumflex tilde macron breve dotaccent dieresis .notdef ring ' . - 'cedilla .notdef hungarumlaut ogonek caron emdash .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef AE .notdef ' . - 'ordfeminine .notdef .notdef .notdef .notdef Lslash Oslash OE ' . - 'ordmasculine .notdef .notdef .notdef .notdef .notdef ae .notdef ' . - '.notdef .notdef dotlessi .notdef .notdef lslash oslash oe germandbls ' . + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + 'space exclam quotedbl numbersign dollar percent ampersand quoteright '. + 'parenleft parenright asterisk plus comma hyphen period slash zero '. + 'one two three four five six seven eight nine colon semicolon less '. + 'equal greater question at A B C D E F G H I J K L M N O P Q R S T U '. + 'V W X Y Z bracketleft backslash bracketright asciicircum underscore '. + 'quoteleft a b c d e f g h i j k l m n o p q r s t u v w x y z '. + 'braceleft bar braceright asciitilde .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef exclamdown cent '. + 'sterling fraction yen florin section currency quotesingle '. + 'quotedblleft guillemotleft guilsinglleft guilsinglright fi fl '. + '.notdef endash dagger daggerdbl periodcentered .notdef paragraph '. + 'bullet quotesinglbase quotedblbase quotedblright guillemotright '. + 'ellipsis perthousand .notdef questiondown .notdef grave acute '. 
+ 'circumflex tilde macron breve dotaccent dieresis .notdef ring '. + 'cedilla .notdef hungarumlaut ogonek caron emdash .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef AE .notdef '. + 'ordfeminine .notdef .notdef .notdef .notdef Lslash Oslash OE '. + 'ordmasculine .notdef .notdef .notdef .notdef .notdef ae .notdef '. + '.notdef .notdef dotlessi .notdef .notdef lslash oslash oe germandbls '. '.notdef .notdef .notdef .notdef'; return explode(' ', $encoding); diff --git a/src/Smalot/PdfParser/Encoding/WinAnsiEncoding.php b/src/Smalot/PdfParser/Encoding/WinAnsiEncoding.php index 491a0cd0..1938f555 100644 --- a/src/Smalot/PdfParser/Encoding/WinAnsiEncoding.php +++ b/src/Smalot/PdfParser/Encoding/WinAnsiEncoding.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ // Source : http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinANSIEncoding.pm @@ -34,41 +36,39 @@ /** * Class WinAnsiEncoding - * - * @package Smalot\PdfParser\Encoding */ -class WinAnsiEncoding +class WinAnsiEncoding extends AbstractEncoding { - public function getTranslations() + public function getTranslations(): array { $encoding = - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef ' . - 'space exclam quotedbl numbersign dollar percent ampersand quotesingle ' . - 'parenleft parenright asterisk plus comma hyphen period slash zero one ' . - 'two three four five six seven eight nine colon semicolon less equal ' . - 'greater question at A B C D E F G H I J K L M N O P Q R S T U V W X ' . - 'Y Z bracketleft backslash bracketright asciicircum underscore ' . - 'grave a b c d e f g h i j k l m n o p q r s t u v w x y z ' . - 'braceleft bar braceright asciitilde bullet Euro bullet quotesinglbase ' . - 'florin quotedblbase ellipsis dagger daggerdbl circumflex perthousand ' . - 'Scaron guilsinglleft OE bullet Zcaron bullet bullet quoteleft quoteright ' . - 'quotedblleft quotedblright bullet endash emdash tilde trademark scaron ' . - 'guilsinglright oe bullet zcaron Ydieresis space exclamdown cent ' . - 'sterling currency yen brokenbar section dieresis copyright ' . - 'ordfeminine guillemotleft logicalnot hyphen registered macron degree ' . - 'plusminus twosuperior threesuperior acute mu paragraph ' . - 'periodcentered cedilla onesuperior ordmasculine guillemotright ' . - 'onequarter onehalf threequarters questiondown Agrave Aacute ' . - 'Acircumflex Atilde Adieresis Aring AE Ccedilla Egrave Eacute ' . - 'Ecircumflex Edieresis Igrave Iacute Icircumflex Idieresis Eth Ntilde ' . 
- 'Ograve Oacute Ocircumflex Otilde Odieresis multiply Oslash Ugrave ' . - 'Uacute Ucircumflex Udieresis Yacute Thorn germandbls agrave aacute ' . - 'acircumflex atilde adieresis aring ae ccedilla egrave eacute ' . - 'ecircumflex edieresis igrave iacute icircumflex idieresis eth ntilde ' . - 'ograve oacute ocircumflex otilde odieresis divide oslash ugrave ' . + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + '.notdef .notdef .notdef .notdef .notdef .notdef .notdef .notdef '. + 'space exclam quotedbl numbersign dollar percent ampersand quotesingle '. + 'parenleft parenright asterisk plus comma hyphen period slash zero one '. + 'two three four five six seven eight nine colon semicolon less equal '. + 'greater question at A B C D E F G H I J K L M N O P Q R S T U V W X '. + 'Y Z bracketleft backslash bracketright asciicircum underscore '. + 'grave a b c d e f g h i j k l m n o p q r s t u v w x y z '. + 'braceleft bar braceright asciitilde bullet Euro bullet quotesinglbase '. + 'florin quotedblbase ellipsis dagger daggerdbl circumflex perthousand '. + 'Scaron guilsinglleft OE bullet Zcaron bullet bullet quoteleft quoteright '. + 'quotedblleft quotedblright bullet endash emdash tilde trademark scaron '. + 'guilsinglright oe bullet zcaron Ydieresis space exclamdown cent '. + 'sterling currency yen brokenbar section dieresis copyright '. + 'ordfeminine guillemotleft logicalnot hyphen registered macron degree '. + 'plusminus twosuperior threesuperior acute mu paragraph '. + 'periodcentered cedilla onesuperior ordmasculine guillemotright '. + 'onequarter onehalf threequarters questiondown Agrave Aacute '. + 'Acircumflex Atilde Adieresis Aring AE Ccedilla Egrave Eacute '. + 'Ecircumflex Edieresis Igrave Iacute Icircumflex Idieresis Eth Ntilde '. 
+ 'Ograve Oacute Ocircumflex Otilde Odieresis multiply Oslash Ugrave '. + 'Uacute Ucircumflex Udieresis Yacute Thorn germandbls agrave aacute '. + 'acircumflex atilde adieresis aring ae ccedilla egrave eacute '. + 'ecircumflex edieresis igrave iacute icircumflex idieresis eth ntilde '. + 'ograve oacute ocircumflex otilde odieresis divide oslash ugrave '. 'uacute ucircumflex udieresis yacute thorn ydieresis'; return explode(' ', $encoding); diff --git a/src/Smalot/PdfParser/Exception/EmptyPdfException.php b/src/Smalot/PdfParser/Exception/EmptyPdfException.php new file mode 100644 index 00000000..9eda9ce2 --- /dev/null +++ b/src/Smalot/PdfParser/Exception/EmptyPdfException.php @@ -0,0 +1,12 @@ + - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser; +use Smalot\PdfParser\Encoding\WinAnsiEncoding; +use Smalot\PdfParser\Exception\EncodingNotFoundException; + /** * Class Font - * - * @package Smalot\PdfParser */ -class Font extends Object +class Font extends PDFObject { + public const MISSING = '?'; + /** - * + * @var array */ - const MISSING = '?'; + protected $table; /** * @var array */ - protected $table = null; + protected $tableSizes; /** + * Caches results from uchr. + * * @var array */ - protected $tableSizes = null; + private static $uchrCache = []; /** + * In some PDF-files encoding could be referenced by object id but object itself does not contain + * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in + * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject). + * + * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property. * + * @var Encoding + * + * @see https://github.com/smalot/pdfparser/pull/500 */ + private $initializedEncodingByPdfObject; + public function init() { // Load translate table. $this->loadTranslateTable(); } - /** - * @return string - */ - public function getName() + public function getName(): string { - return $this->has('BaseFont') ? (string)$this->get('BaseFont') : '[Unknown]'; + return $this->has('BaseFont') ? (string) $this->get('BaseFont') : '[Unknown]'; } - /** - * @return string - */ - public function getType() + public function getType(): string { - return (string)$this->header->get('Subtype'); + return (string) $this->header->get('Subtype'); } - /** - * @return array - */ - public function getDetails($deep = true) + public function getDetails(bool $deep = true): array { - $details = array(); + $details = []; - $details['Name'] = $this->getName(); - $details['Type'] = $this->getType(); - $details['Encoding'] = ($this->has('Encoding') ? 
(string)$this->get('Encoding') : 'Ansi'); + $details['Name'] = $this->getName(); + $details['Type'] = $this->getType(); + $details['Encoding'] = ($this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi'); $details += parent::getDetails($deep); @@ -94,52 +102,75 @@ public function getDetails($deep = true) } /** - * @param string $char - * @param bool $use_default - * - * @return string + * @return string|bool */ - public function translateChar($char, $use_default = true) + public function translateChar(string $char, bool $use_default = true) { $dec = hexdec(bin2hex($char)); - if (array_key_exists($dec, $this->table)) { - $char = $this->table[$dec]; - } else { - $char = ($use_default ? self::MISSING : false); + if (\array_key_exists($dec, $this->table)) { + return $this->table[$dec]; + } + + // fallback for decoding single-byte ANSI characters that are not in the lookup table + $fallbackDecoded = $char; + if ( + \strlen($char) < 2 + && $this->has('Encoding') + && $this->get('Encoding') instanceof Encoding + ) { + try { + if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) { + $fallbackDecoded = self::uchr($dec); + } + } catch (EncodingNotFoundException $e) { + // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exists + // See table 5.11 on PDF 1.5 specs for more info + } } - return $char; + return $use_default ? self::MISSING : $fallbackDecoded; } /** - * @param int $code + * Convert unicode character code to "utf-8" encoded string. * - * @return string + * @param int|float $code Unicode character code. Will be casted to int internally! */ - public static function uchr($code) + public static function uchr($code): string { - return html_entity_decode('&#' . ((int)$code) . ';', ENT_NOQUOTES, 'UTF-8'); + // note: + // $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623 + // because in some cases uchr was called with a float instead of an integer. 
+ $code = (int) $code; + + if (!isset(self::$uchrCache[$code])) { + // html_entity_decode() will not work with UTF-16 or UTF-32 char entities, + // therefore, we use mb_convert_encoding() instead + self::$uchrCache[$code] = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES'); + } + + return self::$uchrCache[$code]; } /** - * @return array + * Init internal chars translation table by ToUnicode CMap. */ - public function loadTranslateTable() + public function loadTranslateTable(): array { - if (!is_null($this->table)) { + if (null !== $this->table) { return $this->table; } - $this->table = array(); - $this->tableSizes = array( + $this->table = []; + $this->tableSizes = [ 'from' => 1, - 'to' => 1, - ); + 'to' => 1, + ]; if ($this->has('ToUnicode')) { $content = $this->get('ToUnicode')->getContent(); - $matches = array(); + $matches = []; // Support for multiple spacerange sections if (preg_match_all('/begincodespacerange(?P.*?)endcodespacerange/s', $content, $matches)) { @@ -148,10 +179,10 @@ public function loadTranslateTable() preg_match_all($regexp, $section, $matches); - $this->tableSizes = array( - 'from' => max(1, strlen(current($matches['from'])) / 2), - 'to' => max(1, strlen(current($matches['to'])) / 2), - ); + $this->tableSizes = [ + 'from' => max(1, \strlen(current($matches['from'])) / 2), + 'to' => max(1, \strlen(current($matches['to'])) / 2), + ]; break; } @@ -160,20 +191,20 @@ public function loadTranslateTable() // Support for multiple bfchar sections if (preg_match_all('/beginbfchar(?P.*?)endbfchar/s', $content, $matches)) { foreach ($matches['sections'] as $section) { - $regexp = '/<(?P[0-9A-F]+)> +<(?P[0-9A-F]+)>[ \r\n]+/is'; + $regexp = '/<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)>[ \r\n]+/is'; preg_match_all($regexp, $section, $matches); - $this->tableSizes['from'] = max(1, strlen(current($matches['from'])) / 2); + $this->tableSizes['from'] = max(1, \strlen(current($matches['from'])) / 2); foreach ($matches['from'] as $key => $from) { $parts = 
preg_split( '/([0-9A-F]{4})/i', $matches['to'][$key], 0, - PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE + \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE ); - $text = ''; + $text = ''; foreach ($parts as $part) { $text .= self::uchr(hexdec($part)); } @@ -185,44 +216,53 @@ public function loadTranslateTable() // Support for multiple bfrange sections if (preg_match_all('/beginbfrange(?P.*?)endbfrange/s', $content, $matches)) { foreach ($matches['sections'] as $section) { - // Support for : - $regexp = '/<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)>[ \r\n]+/is'; + /** + * Regexp to capture , , and either or [...] items. + * - (?P...) Source range's start + * - (?P...) Source range's end + * - (?P...) Destination range's offset or each char code + * Some PDF file has 2-byte Unicode values on new lines > added \r\n + */ + $regexp = '/<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)> *(?P<[0-9A-F]+>|\[[\r\n<>0-9A-F ]+\])[ \r\n]+/is'; preg_match_all($regexp, $section, $matches); foreach ($matches['from'] as $key => $from) { $char_from = hexdec($from); - $char_to = hexdec($matches['to'][$key]); - $offset = hexdec($matches['offset'][$key]); + $char_to = hexdec($matches['to'][$key]); + $dest = $matches['dest'][$key]; - for ($char = $char_from; $char <= $char_to; $char++) { - $this->table[$char] = self::uchr($char - $char_from + $offset); - } - } + if (1 === preg_match('/^<(?P[0-9A-F]+)>$/i', $dest, $offset_matches)) { + // Support for : + $offset = hexdec($offset_matches['offset']); - // Support for : [ ... ] - $regexp = '/<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)> *\[(?P[<>0-9A-F ]+)\][ \r\n]+/is'; - - preg_match_all($regexp, $section, $matches); + for ($char = $char_from; $char <= $char_to; ++$char) { + $this->table[$char] = self::uchr($char - $char_from + $offset); + } + } else { + // Support for : [ ... 
] + $strings = []; + $matched = preg_match_all('/<(?P[0-9A-F]+)> */is', $dest, $strings); + if (false === $matched || 0 === $matched) { + continue; + } - foreach ($matches['from'] as $key => $from) { - $char_from = hexdec($from); - $strings = array(); - - preg_match_all('/<(?P[0-9A-F]+)> */is', $matches['strings'][$key], $strings); - - foreach ($strings['string'] as $position => $string) { - $parts = preg_split( - '/([0-9A-F]{4})/i', - $string, - 0, - PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE - ); - $text = ''; - foreach ($parts as $part) { - $text .= self::uchr(hexdec($part)); + foreach ($strings['string'] as $position => $string) { + $parts = preg_split( + '/([0-9A-F]{4})/i', + $string, + 0, + \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE + ); + if (false === $parts) { + continue; + } + $text = ''; + foreach ($parts as $part) { + $text .= self::uchr(hexdec($part)); + } + $this->table[$char_from + $position] = $text; } - $this->table[$char_from + $position] = $text; } } } @@ -233,24 +273,79 @@ public function loadTranslateTable() } /** - * @param string $hexa - * @param bool $add_braces + * Set custom char translation table where: + * - key - integer character code; + * - value - "utf-8" encoded value; * - * @return string + * @return void + */ + public function setTable(array $table) + { + $this->table = $table; + } + + /** + * Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array. */ - public static function decodeHexadecimal($hexa, $add_braces = false) + public function calculateTextWidth(string $text, ?array &$missing = null): ?float { - $text = ''; - $parts = preg_split('/(<[a-z0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); + $index_map = array_flip($this->table); + $details = $this->getDetails(); + + // Usually, Widths key is set in $details array, but if it isn't use an empty array instead. + $widths = $details['Widths'] ?? 
[]; + + /* + * Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar + * + * Note: Without the change you would see warnings in PHP 8.4 because the values of FirstChar or LastChar + * can be null sometimes. + */ + $width_map = array_flip(range((int) $details['FirstChar'], (int) $details['LastChar'])); + + $width = null; + $missing = []; + $textLength = mb_strlen($text); + for ($i = 0; $i < $textLength; ++$i) { + $char = mb_substr($text, $i, 1); + if ( + !\array_key_exists($char, $index_map) + || !\array_key_exists($index_map[$char], $width_map) + || !\array_key_exists($width_map[$index_map[$char]], $widths) + ) { + $missing[] = $char; + continue; + } + $width_index = $width_map[$index_map[$char]]; + $width += $widths[$width_index]; + } + + return $width; + } + + /** + * Decode hexadecimal encoded string. If $add_braces is true result value would be wrapped by parentheses. + */ + public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string + { + // Special shortcut for XML content. + if (false !== stripos($hexa, ')/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE); foreach ($parts as $part) { - if (preg_match('/^<.*>$/', $part)) { + if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) { + // strip whitespace + $part = preg_replace("/\s/", '', $part); $part = trim($part, '<>'); if ($add_braces) { $text .= '('; } - $part = pack("H*", $part); + $part = pack('H*', $part); $text .= ($add_braces ? preg_replace('/\\\/s', '\\\\\\', $part) : $part); if ($add_braces) { @@ -265,59 +360,50 @@ public static function decodeHexadecimal($hexa, $add_braces = false) } /** - * @param string $text - * - * @return string + * Decode string with octal-decoded chunks. 
*/ - public static function decodeOctal($text) + public static function decodeOctal(string $text): string { - $parts = preg_split('/(\\\\\d{3})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); - $text = ''; + // Replace all double backslashes \\ with a special string + $text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']); - foreach ($parts as $part) { - if (preg_match('/^\\\\\d{3}$/', $part)) { - $text .= chr(octdec(trim($part, '\\'))); - } else { - $text .= $part; - } - } + // Now we can replace all octal codes without worrying about + // escaped backslashes + $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) { + return \chr(octdec($m[1])); + }, $text); - return $text; + // Unescape any parentheses + $text = str_replace(['\\(', '\\)'], ['(', ')'], $text); + + // Replace instances of the special string with a single backslash + return str_replace('[**pdfparserdblslsh**]', '\\', $text); } /** - * @param $text - * - * @return string + * Decode string with html entity encoded chars. */ - public static function decodeEntities($text) + public static function decodeEntities(string $text): string { - $parts = preg_split('/(#\d{2})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); - $text = ''; - - foreach ($parts as $part) { - if (preg_match('/^#\d{2}$/', $part)) { - $text .= chr(hexdec(trim($part, '#'))); - } else { - $text .= $part; - } - } - - return $text; + return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) { + return \chr(hexdec($m[1])); + }, $text); } /** - * @param string $text + * Check if given string is Unicode text (by BOM); + * If true - decode to "utf-8" encoded string. + * Otherwise - return text as is. * - * @return string + * @todo Rename in next major release to make the name correspond to reality (for ex. 
decodeIfUnicode()) */ - public static function decodeUnicode($text) + public static function decodeUnicode(string $text): string { - if (preg_match('/^\xFE\xFF/i', $text)) { + if ("\xFE\xFF" === substr($text, 0, 2)) { // Strip U+FEFF byte order marker. $decode = substr($text, 2); - $text = ''; - $length = strlen($decode); + $text = ''; + $length = \strlen($decode); for ($i = 0; $i < $length; $i += 2) { $text .= self::uchr(hexdec(bin2hex(substr($decode, $i, 2)))); @@ -328,48 +414,44 @@ public static function decodeUnicode($text) } /** - * @return int + * @todo Deprecated, use $this->config->getFontSpaceLimit() instead. */ - protected function getFontSpaceLimit() + protected function getFontSpaceLimit(): int { - return -50; + return $this->config->getFontSpaceLimit(); } /** - * @param array $commands - * - * @return string + * Decode text by commands array. */ - public function decodeText($commands) + public function decodeText(array $commands, float $fontFactor = 4): string { $word_position = 0; - $words = array(); - $unicode = false; - $font_space = $this->getFontSpaceLimit(); + $words = []; + $font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4; foreach ($commands as $command) { - switch ($command[Object::TYPE]) { + switch ($command[PDFObject::TYPE]) { case 'n': - if (floatval(trim($command[Object::COMMAND])) < $font_space) { - $word_position = count($words); + $offset = (float) trim($command[PDFObject::COMMAND]); + if ($offset - (float) $font_space < 0) { + $word_position = \count($words); } - continue(2); - + continue 2; case '<': // Decode hexadecimal. - $text = self::decodeHexadecimal('<' . $command[Object::COMMAND] . '>'); - $unicode = true; + $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>'); break; default: // Decode octal (if necessary). 
- $text = self::decodeOctal($command[Object::COMMAND]); + $text = self::decodeOctal($command[PDFObject::COMMAND]); } // replace escaped chars $text = str_replace( - array('\\\\', '\(', '\)', '\n', '\r', '\t', '\ '), - array('\\', '(', ')', "\n", "\r", "\t", ' '), + ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'], + [\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)], $text ); @@ -382,123 +464,246 @@ public function decodeText($commands) } foreach ($words as &$word) { - $loop_unicode = $unicode; - $word = $this->decodeContent($word, $loop_unicode); + $word = $this->decodeContent($word); + $word = str_replace("\t", ' ', $word); } - return implode(' ', $words); + // Remove internal "words" that are just spaces, but leave them + // if they are at either end of the array of words. This fixes, + // for example, lines that are justified to fill + // a whole row. + for ($x = \count($words) - 2; $x >= 1; --$x) { + if ('' === trim($words[$x], ' ')) { + unset($words[$x]); + } + } + $words = array_values($words); + + // Cut down on the number of unnecessary internal spaces by + // imploding the string on the null byte, and checking if the + // text includes extra spaces on either side. If so, merge + // where appropriate. + $words = implode("\x00\x00", $words); + $words = str_replace( + [" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"], + [' ', ' ', ' ', ' '], + $words + ); + + return $words; } /** - * @param string $text - * @param bool $unicode + * Decode given $text to "utf-8" encoded string. 
* - * @return string + * @param bool $unicode This parameter is deprecated and might be removed in a future release */ - protected function decodeContent($text, &$unicode) + public function decodeContent(string $text, ?bool &$unicode = null): string { + // If this string begins with a UTF-16BE BOM, then decode it + // directly as Unicode + if ("\xFE\xFF" === substr($text, 0, 2)) { + return $this->decodeUnicode($text); + } + if ($this->has('ToUnicode')) { + return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text); + } - $bytes = $this->tableSizes['from']; + if ($this->has('Encoding')) { + $result = $this->decodeContentByEncoding($text); - if ($bytes) { - $result = ''; - $length = strlen($text); + if (null !== $result) { + return $result; + } + } - for ($i = 0; $i < $length; $i += $bytes) { - $char = substr($text, $i, $bytes); + return $this->decodeContentByAutodetectIfNecessary($text); + } - if (($decoded = $this->translateChar($char, false)) !== false) { - $char = $decoded; - } elseif ($this->has('DescendantFonts')) { + /** + * First try to decode $text by ToUnicode CMap. + * If char translation not found in ToUnicode CMap tries: + * - If DescendantFonts exists tries to decode char by one of that fonts. + * - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding. + * - If DescendantFonts does not exist just return "?" as decoded char. + * + * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten. 
+ */ + private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string + { + $bytes = $this->tableSizes['from']; - if ($this->get('DescendantFonts') instanceof Object) { - $fonts = $this->get('DescendantFonts')->getHeader()->getElements(); - } else { - $fonts = $this->get('DescendantFonts')->getContent(); - } - $decoded = false; + if ($bytes) { + $result = ''; + $length = \strlen($text); - foreach ($fonts as $font) { - if ($font instanceof Font) { - if (($decoded = $font->translateChar($char, false)) !== false) { - $decoded = @iconv('Windows-1252', 'UTF-8//TRANSLIT//IGNORE', $decoded); - break; - } + for ($i = 0; $i < $length; $i += $bytes) { + $char = substr($text, $i, $bytes); + + if (false !== ($decoded = $this->translateChar($char, false))) { + $char = $decoded; + } elseif ($this->has('DescendantFonts')) { + if ($this->get('DescendantFonts') instanceof PDFObject) { + $fonts = $this->get('DescendantFonts')->getHeader()->getElements(); + } else { + $fonts = $this->get('DescendantFonts')->getContent(); + } + $decoded = false; + + foreach ($fonts as $font) { + if ($font instanceof self) { + if (false !== ($decoded = $font->translateChar($char, false))) { + $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252'); + break; } } + } - if ($decoded !== false) { - $char = $decoded; - } else { - $char = @iconv('Windows-1252', 'UTF-8//TRANSLIT//IGNORE', $char); - } + if (false !== $decoded) { + $char = $decoded; } else { - $char = self::MISSING; + $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252'); } - - $result .= $char; + } else { + $char = self::MISSING; } - $text = $result; - - // By definition, this code generates unicode chars. - $unicode = true; + $result .= $char; } - } elseif ($this->has('Encoding')) { - /** @var Encoding $encoding */ - $encoding = $this->get('Encoding'); - - if ($encoding instanceof Encoding) { - if ($unicode) { - $chars = preg_split( - '//s' . ($unicode ? 
'u' : ''), - $text, - -1, - PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY - ); - $result = ''; - - foreach ($chars as $char) { - $dec_av = hexdec(bin2hex($char)); - $dec_ap = $encoding->translateChar($dec_av); - $result .= self::uchr($dec_ap); - } - $text = $result; - } else { - $result = ''; - $length = strlen($text); + $text = $result; + } - for ($i = 0; $i < $length; $i++) { - $dec_av = hexdec(bin2hex($text[$i])); - $dec_ap = $encoding->translateChar($dec_av); - $result .= chr($dec_ap); - } + return $text; + } - $text = $result; + /** + * Decode content by any type of Encoding (dictionary's item) instance. + */ + private function decodeContentByEncoding(string $text): ?string + { + $encoding = $this->get('Encoding'); - if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) { - $text = @iconv('Mac', 'UTF-8//TRANSLIT//IGNORE', $text); + // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in it's dictionary. + if ($encoding instanceof PDFObject) { + $encoding = $this->getInitializedEncodingByPdfObject($encoding); + } - return $text; - } - } - } + // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in it's dictionary. + if ($encoding instanceof Encoding) { + return $this->decodeContentByEncodingEncoding($text, $encoding); } - // Convert to unicode if not already done. - if (!$unicode) { + // When Encoding is just string (/Encoding /WinAnsiEncoding) + if ($encoding instanceof Element) { // todo: ElementString class must by used? 
+ return $this->decodeContentByEncodingElement($text, $encoding); + } - if ($this->get('Encoding') instanceof Element && - $this->get('Encoding')->equals('MacRomanEncoding') - ) { - $text = @iconv('Mac', 'UTF-8//TRANSLIT//IGNORE', $text); - } else { - $text = @iconv('Windows-1252', 'UTF-8//TRANSLIT//IGNORE', $text); - } + // don't double-encode strings already in UTF-8 + if (!mb_check_encoding($text, 'UTF-8')) { + return mb_convert_encoding($text, 'UTF-8', 'Windows-1252'); } return $text; } + + /** + * Returns already created or create a new one if not created before Encoding instance by PDFObject instance. + */ + private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding + { + if (!$this->initializedEncodingByPdfObject) { + $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject); + } + + return $this->initializedEncodingByPdfObject; + } + + /** + * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding. + */ + private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string + { + $result = ''; + $length = \strlen($text); + + for ($i = 0; $i < $length; ++$i) { + $dec_av = hexdec(bin2hex($text[$i])); + $dec_ap = $encoding->translateChar($dec_av); + $result .= self::uchr($dec_ap ?? $dec_av); + } + + return $result; + } + + /** + * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element. + */ + private function decodeContentByEncodingElement(string $text, Element $encoding): ?string + { + $pdfEncodingName = $encoding->getContent(); + + // mb_convert_encoding does not support MacRoman/macintosh, + // so we use iconv() here + $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName); + + return $iconvEncodingName ? iconv($iconvEncodingName, 'UTF-8//TRANSLIT//IGNORE', $text) : null; + } + + /** + * Convert PDF encoding name to iconv-known encoding name. 
+ */ + private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string + { + $pdfToIconvEncodingNameMap = [ + 'StandardEncoding' => 'ISO-8859-1', + 'MacRomanEncoding' => 'MACINTOSH', + 'WinAnsiEncoding' => 'CP1252', + ]; + + return \array_key_exists($pdfEncodingName, $pdfToIconvEncodingNameMap) + ? $pdfToIconvEncodingNameMap[$pdfEncodingName] + : null; + } + + /** + * If string seems like "utf-8" encoded string do nothing and just return given string as is. + * Otherwise, interpret string as "Window-1252" encoded string. + * + * @return string|false + */ + private function decodeContentByAutodetectIfNecessary(string $text) + { + if (mb_check_encoding($text, 'UTF-8')) { + return $text; + } + + return mb_convert_encoding($text, 'UTF-8', 'Windows-1252'); + // todo: Why exactly `Windows-1252` used? + } + + /** + * Create Encoding instance by PDFObject instance and init it. + */ + private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding + { + $encoding = $this->createEncodingByPdfObject($PDFObject); + $encoding->init(); + + return $encoding; + } + + /** + * Create Encoding instance by PDFObject instance (without init). + */ + private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding + { + $document = $PDFObject->getDocument(); + $header = $PDFObject->getHeader(); + $content = $PDFObject->getContent(); + $config = $PDFObject->getConfig(); + + return new Encoding($document, $header, $content, $config); + } } diff --git a/src/Smalot/PdfParser/Font/FontCIDFontType0.php b/src/Smalot/PdfParser/Font/FontCIDFontType0.php index bf8a233c..310c44cf 100644 --- a/src/Smalot/PdfParser/Font/FontCIDFontType0.php +++ b/src/Smalot/PdfParser/Font/FontCIDFontType0.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. 
* * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Font; @@ -34,8 +36,6 @@ /** * Class FontCIDFontType0 - * - * @package Smalot\PdfParser\Font */ class FontCIDFontType0 extends Font { diff --git a/src/Smalot/PdfParser/Font/FontCIDFontType2.php b/src/Smalot/PdfParser/Font/FontCIDFontType2.php index abd90f31..077d6e7a 100644 --- a/src/Smalot/PdfParser/Font/FontCIDFontType2.php +++ b/src/Smalot/PdfParser/Font/FontCIDFontType2.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Font; @@ -34,8 +36,6 @@ /** * Class FontCIDFontType2 - * - * @package Smalot\PdfParser\Font */ class FontCIDFontType2 extends Font { diff --git a/src/Smalot/PdfParser/Font/FontTrueType.php b/src/Smalot/PdfParser/Font/FontTrueType.php index 23f0def8..8a55c004 100644 --- a/src/Smalot/PdfParser/Font/FontTrueType.php +++ b/src/Smalot/PdfParser/Font/FontTrueType.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Font; @@ -34,8 +36,6 @@ /** * Class FontTrueType - * - * @package Smalot\PdfParser\Font */ class FontTrueType extends Font { diff --git a/src/Smalot/PdfParser/Font/FontType0.php b/src/Smalot/PdfParser/Font/FontType0.php index 9460e38b..4e5cc6db 100644 --- a/src/Smalot/PdfParser/Font/FontType0.php +++ b/src/Smalot/PdfParser/Font/FontType0.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. 
* - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\Font; @@ -34,8 +36,6 @@ /** * Class FontType0 - * - * @package Smalot\PdfParser\Font */ class FontType0 extends Font { diff --git a/src/Smalot/PdfParser/Font/FontType1.php b/src/Smalot/PdfParser/Font/FontType1.php index 6c117188..ee93e691 100644 --- a/src/Smalot/PdfParser/Font/FontType1.php +++ b/src/Smalot/PdfParser/Font/FontType1.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . 
- * */ namespace Smalot\PdfParser\Font; @@ -34,8 +36,6 @@ /** * Class FontType1 - * - * @package Smalot\PdfParser\Font */ class FontType1 extends Font { diff --git a/src/Smalot/PdfParser/Font/FontType3.php b/src/Smalot/PdfParser/Font/FontType3.php new file mode 100644 index 00000000..08f8da04 --- /dev/null +++ b/src/Smalot/PdfParser/Font/FontType3.php @@ -0,0 +1,42 @@ + + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Smalot\PdfParser\Font; + +use Smalot\PdfParser\Font; + +/** + * Class FontType3 + */ +class FontType3 extends Font +{ +} diff --git a/src/Smalot/PdfParser/Header.php b/src/Smalot/PdfParser/Header.php index 01d6dbca..b58773a5 100644 --- a/src/Smalot/PdfParser/Header.php +++ b/src/Smalot/PdfParser/Header.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser; @@ -37,35 +39,40 @@ /** * Class Header - * - * @package Smalot\PdfParser */ class Header { /** - * @var Document + * @var Document|null */ - protected $document = null; + protected $document; /** * @var Element[] */ - protected $elements = null; + protected $elements; /** - * @param Element[] $elements List of elements. - * @param Document $document Document. + * @param Element[] $elements list of elements + * @param Document $document document */ - public function __construct($elements = array(), Document $document = null) + public function __construct(array $elements = [], ?Document $document = null) { $this->elements = $elements; $this->document = $document; } + public function init() + { + foreach ($this->elements as $element) { + if ($element instanceof Element) { + $element->init(); + } + } + } + /** * Returns all elements. - * - * @return mixed */ public function getElements() { @@ -78,34 +85,27 @@ public function getElements() /** * Used only for debug. 
- * - * @return array */ - public function getElementTypes() + public function getElementTypes(): array { - $types = array(); + $types = []; foreach ($this->elements as $key => $element) { - $types[$key] = get_class($element); + $types[$key] = \get_class($element); } return $types; } - /** - * @param bool $deep - * - * @return array - */ - public function getDetails($deep = true) + public function getDetails(bool $deep = true): array { - $values = array(); + $values = []; $elements = $this->getElements(); foreach ($elements as $key => $element) { - if ($element instanceof Header && $deep) { + if ($element instanceof self && $deep) { $values[$key] = $element->getDetails($deep); - } elseif ($element instanceof Object && $deep) { + } elseif ($element instanceof PDFObject && $deep) { $values[$key] = $element->getDetails(false); } elseif ($element instanceof ElementArray) { if ($deep) { @@ -122,49 +122,40 @@ public function getDetails($deep = true) /** * Indicate if an element name is available in header. * - * @param string $name The name of the element - * - * @return bool + * @param string $name the name of the element */ - public function has($name) + public function has(string $name): bool { - if (array_key_exists($name, $this->elements)) { - return true; - } else { - return false; - } + return \array_key_exists($name, $this->elements); } /** - * @param string $name - * - * @return Element|Object + * @return Element|PDFObject */ - public function get($name) + public function get(string $name) { - if (array_key_exists($name, $this->elements)) { - return $this->resolveXRef($name); + if (\array_key_exists($name, $this->elements) && $element = $this->resolveXRef($name)) { + return $element; } - return new ElementMissing(null, null); + return new ElementMissing(); } /** * Resolve XRef to object. 
* - * @param string $name + * @return Element|PDFObject * - * @return Element|Object * @throws \Exception */ - protected function resolveXRef($name) + protected function resolveXRef(string $name) { - if (($obj = $this->elements[$name]) instanceof ElementXRef && !is_null($this->document)) { + if (($obj = $this->elements[$name]) instanceof ElementXRef && null !== $this->document) { /** @var ElementXRef $obj */ $object = $this->document->getObjectById($obj->getId()); - if (is_null($object)) { - throw new \Exception('Missing object reference #' . $obj->getId() . '.'); + if (null === $object) { + return new ElementMissing(); } // Update elements list for future calls. @@ -178,28 +169,26 @@ protected function resolveXRef($name) * @param string $content The content to parse * @param Document $document The document * @param int $position The new position of the cursor after parsing - * - * @return Header */ - public static function parse($content, Document $document, &$position = 0) + public static function parse(string $content, Document $document, int &$position = 0): self { - /** @var Header $header */ - if (substr(trim($content), 0, 2) == '<<') { + /* @var Header $header */ + if ('<<' == substr(trim($content), 0, 2)) { $header = ElementStruct::parse($content, $document, $position); } else { $elements = ElementArray::parse($content, $document, $position); + $header = new self([], $document); + if ($elements) { - $header = new self($elements->getRawContent(), null);//$document); - } else { - $header = new self(array(), $document); + $header = new self($elements->getRawContent(), null); } } if ($header) { return $header; - } else { - // Build an empty header. - return new self(array(), $document); } + + // Build an empty header. 
+ return new self([], $document); } } diff --git a/src/Smalot/PdfParser/Object.php b/src/Smalot/PdfParser/Object.php deleted file mode 100644 index 8e47606e..00000000 --- a/src/Smalot/PdfParser/Object.php +++ /dev/null @@ -1,606 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser; - -use Smalot\PdfParser\XObject\Form; -use Smalot\PdfParser\XObject\Image; - -/** - * Class Object - * - * @package Smalot\PdfParser - */ -class Object -{ - const TYPE = 't'; - - const OPERATOR = 'o'; - - const COMMAND = 'c'; - - /** - * @var Document - */ - protected $document = null; - - /** - * @var Header - */ - protected $header = null; - - /** - * @var string - */ - protected $content = null; - - /** - * @param Document $document - * @param Header $header - * @param string $content - */ - public function __construct(Document $document, Header $header = null, $content = null) - { - $this->document = $document; - $this->header = !is_null($header) ? 
$header : new Header(); - $this->content = $content; - } - - /** - * - */ - public function init() - { - - } - - /** - * @return null|Header - */ - public function getHeader() - { - return $this->header; - } - - /** - * @param string $name - * - * @return Element|Object - */ - public function get($name) - { - return $this->header->get($name); - } - - /** - * @param $name - * - * @return bool - */ - public function has($name) - { - return $this->header->has($name); - } - - /** - * @param bool $deep - * - * @return array - */ - public function getDetails($deep = true) - { - return $this->header->getDetails($deep); - } - - /** - * @return null|string - */ - public function getContent() - { - return $this->content; - } - - /** - * @param $content - */ - public function cleanContent($content, $char = 'X') - { - $char = $char[0]; - $content = str_replace(array('\\\\', '\\)', '\\('), $char . $char, $content); - - // Remove image bloc with binary content - preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, PREG_OFFSET_CAPTURE); - foreach ($matches[0] as $part) { - $content = substr_replace($content, str_repeat($char, strlen($part[0])), $part[1], strlen($part[0])); - } - - // Clean content in square brackets [.....] - preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, PREG_OFFSET_CAPTURE); - foreach ($matches[1] as $part) { - $content = substr_replace($content, str_repeat($char, strlen($part[0])), $part[1], strlen($part[0])); - } - - // Clean content in round brackets (.....) 
- preg_match_all('/\((.*?)\)/s', $content, $matches, PREG_OFFSET_CAPTURE); - foreach ($matches[1] as $part) { - $content = substr_replace($content, str_repeat($char, strlen($part[0])), $part[1], strlen($part[0])); - } - - // Clean structure - if ($parts = preg_split('/(<|>)/s', $content, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE)) { - $content = ''; - $level = 0; - foreach ($parts as $part) { - if ($part == '<') { - $level++; - } - - $content .= ($level == 0 ? $part : str_repeat($char, strlen($part))); - - if ($part == '>') { - $level--; - } - } - } - - // Clean BDC and EMC markup - preg_match_all( - '/(\/[A-Za-z0-9\_]*\s*' . preg_quote($char) . '*BDC)/s', - $content, - $matches, - PREG_OFFSET_CAPTURE - ); - foreach ($matches[1] as $part) { - $content = substr_replace($content, str_repeat($char, strlen($part[0])), $part[1], strlen($part[0])); - } - - preg_match_all('/\s(EMC)\s/s', $content, $matches, PREG_OFFSET_CAPTURE); - foreach ($matches[1] as $part) { - $content = substr_replace($content, str_repeat($char, strlen($part[0])), $part[1], strlen($part[0])); - } - - return $content; - } - - /** - * @param $content - * - * @return array - */ - public function getSectionsText($content) - { - $sections = array(); - $content = ' ' . $content . ' '; - $textCleaned = $this->cleanContent($content, '_'); - - // Extract text blocks. - if (preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s+ET/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) { - foreach ($matches[1] as $part) { - $text = $part[0]; - $offset = $part[1]; - $section = substr($content, $offset, strlen($text)); - - // Removes BDC and EMC markup. - $section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section . ' '); - - $sections[] = $section; - } - } - - // Extract 'do' commands. 
- if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) { - foreach ($matches[1] as $part) { - $text = $part[0]; - $offset = $part[1]; - $section = substr($content, $offset, strlen($text)); - - $sections[] = $section; - } - } - - return $sections; - } - - /** - * @param Page - * - * @return string - * @throws \Exception - */ - public function getText(Page $page = null) - { - $text = ''; - $sections = $this->getSectionsText($this->content); - $current_font = new Font($this->document); - $current_position_td = array('x' => false, 'y' => false); - $current_position_tm = array('x' => false, 'y' => false); - - foreach ($sections as $section) { - - $commands = $this->getCommandsText($section); - - foreach ($commands as $command) { - - switch ($command[self::OPERATOR]) { - // set character spacing - case 'Tc': - break; - - // move text current point - case 'Td': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if ((floatval($x) <= 0) || - ($current_position_td['y'] !== false && floatval($y) < floatval($current_position_td['y'])) - ) { - // vertical offset - $text .= "\n"; - } elseif ($current_position_td['x'] !== false && floatval($x) > floatval( - $current_position_td['x'] - ) - ) { - // horizontal offset - $text .= ' '; - } - $current_position_td = array('x' => $x, 'y' => $y); - break; - - // move text current point and set leading - case 'TD': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if (floatval($y) < 0) { - $text .= "\n"; - } elseif (floatval($x) <= 0) { - $text .= ' '; - } - break; - - case 'Tf': - list($id,) = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim($id, '/'); - $current_font = $page->getFont($id); - break; - - case "'": - case 'Tj': - $command[self::COMMAND] = array($command); - case 'TJ': - // Skip if not previously defined, should never happened. 
- if (is_null($current_font)) { - // Fallback - // TODO : Improve - $text .= $command[self::COMMAND][0][self::COMMAND]; - continue; - } - - $sub_text = $current_font->decodeText($command[self::COMMAND]); - $text .= $sub_text; - break; - - // set leading - case 'TL': - $text .= ' '; - break; - - case 'Tm': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if ($current_position_tm['y'] !== false) { - $delta = abs(floatval($y) - floatval($current_position_tm['y'])); - if ($delta > 10) { - $text .= "\n"; - } - } - $current_position_tm = array('x' => $x, 'y' => $y); - break; - - // set super/subscripting text rise - case 'Ts': - break; - - // set word spacing - case 'Tw': - break; - - // set horizontal scaling - case 'Tz': - $text .= "\n"; - break; - - // move to start of next line - case 'T*': - $text .= "\n"; - break; - - case 'Da': - break; - - case 'Do': - if (!is_null($page)) { - $args = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim(array_pop($args), '/ '); - if ($xobject = $page->getXObject($id)) { - $text .= $xobject->getText($page); - } - } - break; - - case 'rg': - case 'RG': - break; - - case 're': - break; - - case 'co': - break; - - case 'cs': - break; - - case 'gs': - break; - - case 'en': - break; - - case 'sc': - case 'SC': - break; - - case 'g': - case 'G': - break; - - case 'V': - break; - - case 'vo': - case 'Vo': - break; - - default: - } - } - } - - return $text . 
' '; - } - - /** - * @param string $text_part - * @param int $offset - * - * @return array - */ - public function getCommandsText($text_part, &$offset = 0) - { - $commands = $matches = array(); - - while ($offset < strlen($text_part)) { - $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset); - $char = $text_part[$offset]; - - $operator = ''; - $type = ''; - $command = false; - - switch ($char) { - case '/': - $type = $char; - if (preg_match( - '/^\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si', - substr($text_part, $offset), - $matches - ) - ) { - $operator = $matches[2]; - $command = $matches[1]; - $offset += strlen($matches[0]); - } elseif (preg_match( - '/^\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si', - substr($text_part, $offset), - $matches - ) - ) { - $operator = $matches[2]; - $command = $matches[1]; - $offset += strlen($matches[0]); - } - break; - - case '[': - case ']': - // array object - $type = $char; - if ($char == '[') { - ++$offset; - // get elements - $command = $this->getCommandsText($text_part, $offset); - - if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) { - $operator = trim($matches[0]); - $offset += strlen($matches[0]); - } - } else { - ++$offset; - break; - } - break; - - case '<': - case '>': - // array object - $type = $char; - ++$offset; - if ($char == '<') { - $strpos = strpos($text_part, '>', $offset); - $command = substr($text_part, $offset, ($strpos - $offset)); - $offset = $strpos + 1; - } - - if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) { - $operator = trim($matches[0]); - $offset += strlen($matches[0]); - } - break; - - case '(': - case ')': - ++$offset; - $type = $char; - $strpos = $offset; - if ($char == '(') { - $open_bracket = 1; - while ($open_bracket > 0) { - if (!isset($text_part[$strpos])) { - break; - } - $ch = $text_part[$strpos]; - switch ($ch) { - case '\\': - { // REVERSE SOLIDUS (5Ch) (Backslash) - // skip next character - ++$strpos; - break; - 
} - case '(': - { // LEFT PARENHESIS (28h) - ++$open_bracket; - break; - } - case ')': - { // RIGHT PARENTHESIS (29h) - --$open_bracket; - break; - } - } - ++$strpos; - } - $command = substr($text_part, $offset, ($strpos - $offset - 1)); - $offset = $strpos; - - if (preg_match('/^\s*([A-Z\']{1,2})\s*/si', substr($text_part, $offset), $matches)) { - $operator = $matches[1]; - $offset += strlen($matches[0]); - } - } - break; - - default: - - if (substr($text_part, $offset, 2) == 'ET') { - break; - } elseif (preg_match( - '/^\s*(?P([0-9\.\-]+\s*?)+)\s+(?P[A-Z]{1,3})\s*/si', - substr($text_part, $offset), - $matches - ) - ) { - $operator = trim($matches['id']); - $command = trim($matches['data']); - $offset += strlen($matches[0]); - } elseif (preg_match('/^\s*([0-9\.\-]+\s*?)+\s*/si', substr($text_part, $offset), $matches)) { - $type = 'n'; - $command = trim($matches[0]); - $offset += strlen($matches[0]); - } elseif (preg_match('/^\s*([A-Z\*]+)\s*/si', substr($text_part, $offset), $matches)) { - $type = ''; - $operator = $matches[1]; - $command = ''; - $offset += strlen($matches[0]); - } - } - - if ($command !== false) { - $commands[] = array( - self::TYPE => $type, - self::OPERATOR => $operator, - self::COMMAND => $command, - ); - } else { - break; - } - } - - return $commands; - } - - /** - * @param $document Document - * @param $header Header - * @param $content string - * - * @return Object - */ - public static function factory(Document $document, Header $header, $content) - { - switch ($header->get('Type')->getContent()) { - case 'XObject': - switch ($header->get('Subtype')->getContent()) { - case 'Image': - return new Image($document, $header, $content); - - case 'Form': - return new Form($document, $header, $content); - - default: - return new Object($document, $header, $content); - } - break; - - case 'Pages': - return new Pages($document, $header, $content); - - case 'Page': - return new Page($document, $header, $content); - - case 'Encoding': - return new 
Encoding($document, $header, $content); - - case 'Font': - $subtype = $header->get('Subtype')->getContent(); - $classname = '\Smalot\PdfParser\Font\Font' . $subtype; - - if (class_exists($classname)) { - return new $classname($document, $header, $content); - } else { - return new Font($document, $header, $content); - } - - default: - return new Object($document, $header, $content); - } - } -} diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php new file mode 100644 index 00000000..61d23edb --- /dev/null +++ b/src/Smalot/PdfParser/PDFObject.php @@ -0,0 +1,1205 @@ + + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Smalot\PdfParser; + +use Smalot\PdfParser\Exception\InvalidDictionaryObjectException; +use Smalot\PdfParser\XObject\Form; +use Smalot\PdfParser\XObject\Image; + +/** + * Class PDFObject + */ +class PDFObject +{ + public const TYPE = 't'; + + public const OPERATOR = 'o'; + + public const COMMAND = 'c'; + + /** + * The recursion stack. 
+ * + * @var array + */ + public static $recursionStack = []; + + /** + * @var Document|null + */ + protected $document; + + /** + * @var Header + */ + protected $header; + + /** + * @var string + */ + protected $content; + + /** + * @var Config|null + */ + protected $config; + + /** + * @var bool + */ + protected $addPositionWhitespace = false; + + public function __construct( + Document $document, + ?Header $header = null, + ?string $content = null, + ?Config $config = null + ) { + $this->document = $document; + $this->header = $header ?? new Header(); + $this->content = $content; + $this->config = $config; + } + + public function init() + { + } + + public function getDocument(): Document + { + return $this->document; + } + + public function getHeader(): ?Header + { + return $this->header; + } + + public function getConfig(): ?Config + { + return $this->config; + } + + /** + * @return Element|PDFObject|Header + */ + public function get(string $name) + { + return $this->header->get($name); + } + + public function has(string $name): bool + { + return $this->header->has($name); + } + + public function getDetails(bool $deep = true): array + { + return $this->header->getDetails($deep); + } + + public function getContent(): ?string + { + return $this->content; + } + + /** + * Creates a duplicate of the document stream with + * strings and other items replaced by $char. Formerly + * getSectionsText() used this output to more easily gather offset + * values to extract text from the *actual* document stream. 
+ * + * @deprecated function is no longer used and will be removed in a future release + * + * @internal + */ + public function cleanContent(string $content, string $char = 'X') + { + $char = $char[0]; + $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content); + + // Remove image bloc with binary content + preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE); + foreach ($matches[0] as $part) { + $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); + } + + // Clean content in square brackets [.....] + preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE); + foreach ($matches[1] as $part) { + $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); + } + + // Clean content in round brackets (.....) + preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE); + foreach ($matches[1] as $part) { + $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); + } + + // Clean structure + if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) { + $content = ''; + $level = 0; + foreach ($parts as $part) { + if ('<' == $part) { + ++$level; + } + + $content .= (0 == $level ? 
$part : str_repeat($char, \strlen($part))); + + if ('>' == $part) { + --$level; + } + } + } + + // Clean BDC and EMC markup + preg_match_all( + '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char).'*BDC)/s', + $content, + $matches, + \PREG_OFFSET_CAPTURE + ); + foreach ($matches[1] as $part) { + $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); + } + + preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE); + foreach ($matches[1] as $part) { + $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); + } + + return $content; + } + + /** + * Takes a string of PDF document stream text and formats + * it into a multi-line string with one PDF command on each line, + * separated by \r\n. If the given string is null, or binary data + * is detected instead of a document stream then return an empty + * string. + */ + private function formatContent(?string $content): string + { + if (null === $content) { + return ''; + } + + // Outside of (String) and inline image content in PDF document + // streams, all text should conform to UTF-8. Test for binary + // content by deleting everything after the first open- + // parenthesis ( which indicates the beginning of a string, or + // the first ID command which indicates the beginning of binary + // inline image content. Then test what remains for valid + // UTF-8. If it's not UTF-8, return an empty string as this + // $content is most likely binary. Unfortunately, using + // mb_check_encoding(..., 'UTF-8') is not strict enough, so the + // following regexp, adapted from the W3, is used. See: + // https://www.w3.org/International/questions/qa-forms-utf-8.en + // We use preg_replace() instead of preg_match() to avoid "JIT + // stack limit exhausted" errors on larger files. 
+ $utf8Filter = preg_replace('/( + [\x09\x0A\x0D\x20-\x7E] | # ASCII + [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte + \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs + [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte + \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates + \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3 + [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15 + \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content)); + + if ('' !== $utf8Filter) { + return ''; + } + + // Find all inline image content and replace them so they aren't + // affected by the next steps + $pdfInlineImages = []; + $offsetBI = 0; + while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) { + // Attempt to detemine if this instance of the 'BI' command + // actually occured within a (string) using the following + // steps: + + // Step 1: Remove any escaped slashes and parentheses from + // the alleged image characteristics data + $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]); + + // Step 2: Remove all correctly ordered and balanced + // parentheses from (strings) + do { + $paraTest = $para; + $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest); + } while ($para != $paraTest); + + $paraOpen = strpos($para, '('); + $paraClose = strpos($para, ')'); + + // Check: If the remaining text contains a close parenthesis + // ')' AND it occurs before any open parenthesis, then we + // are almost certain to be inside a (string) + if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) { + // Bump the search offset forward and match again + $offsetBI = (int) $text[1][1]; + continue; + } + + // Step 3: Double check that this is actually inline image + // data by parsing the alleged image characteristics as a + // dictionary + $dict = $this->parseDictionary('<<'.$text[1][0].'>>'); + + // Check if an image Width and Height are set in the dict + if ((isset($dict['W']) 
|| isset($dict['Width'])) + && (isset($dict['H']) || isset($dict['Height']))) { + $id = uniqid('IMAGE_', true); + $pdfInlineImages[$id] = [ + preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]), + preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]), + ]; + $content = preg_replace( + '/'.preg_quote($text[0][0], '/').'/', + '^^^'.$id.'^^^', + $content, + 1 + ); + } else { + // If there was no valid dictionary, or a height and width + // weren't specified, then we don't know what this is, so + // just leave it alone; bump the search offset forward and + // match again + $offsetBI = (int) $text[1][1]; + } + } + + // Find all strings () and replace them so they aren't affected + // by the next steps + $pdfstrings = []; + $attempt = '('; + while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text)) { + // Remove all escaped slashes and parentheses from the target text + $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[0]); + + // PDF strings can contain unescaped parentheses as long as + // they're balanced, so check for balanced parentheses + $left = preg_match_all('/\(/', $para); + $right = preg_match_all('/\)/', $para); + + if (')' == $para[-1] && $left == $right) { + // Replace the string with a unique placeholder + $id = uniqid('STRING_', true); + $pdfstrings[$id] = $text[0]; + $content = preg_replace( + '/'.preg_quote($text[0], '/').'/', + '@@@'.$id.'@@@', + $content, + 1 + ); + + // Reset to search for the next string + $attempt = '('; + } else { + // We had unbalanced parentheses, so use the current + // match as a base to find a longer string + $attempt = $text[0]; + } + } + + // Remove all carriage returns and line-feeds from the document stream + $content = str_replace(["\r", "\n"], ' ', trim($content)); + + // Find all dictionary << >> commands and replace them so they + // aren't affected by the next steps + $dictstore = []; + while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) { + $dictid = 
uniqid('DICT_', true); + $dictstore[$dictid] = $dicttext[1]; + $content = preg_replace( + '/'.preg_quote($dicttext[0], '/').'/', + ' ###'.$dictid.'###'.$dicttext[2], + $content, + 1 + ); + } + + // Normalize white-space in the document stream + $content = preg_replace('/\s{2,}/', ' ', $content); + + // Find all valid PDF operators and add \r\n after each; this + // ensures there is just one command on every line + // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A + // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A + // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while + // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions + // appear here in the list for completeness. + $operators = [ + 'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS', + 'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs', + 'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n', + 'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC', + 'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw', + 'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"', + ]; + foreach ($operators as $operator) { + $content = preg_replace( + '/(?> commands + $dictstore = array_reverse($dictstore, true); + foreach ($dictstore as $id => $dict) { + $content = str_replace('###'.$id.'###', $dict, $content); + } + + // Restore the original string content + $pdfstrings = array_reverse($pdfstrings, true); + foreach ($pdfstrings as $id => $text) { + // Strings may contain escaped newlines, or literal newlines + // and we should clean these up before replacing the string + // back into the content stream; this ensures no strings are + // split between two lines (every command must be on one line) + $text = str_replace( + ["\\\r\n", "\\\r", "\\\n", "\r", "\n"], + ['', '', '', '\r', '\n'], + $text + ); + + $content = 
str_replace('@@@'.$id.'@@@', $text, $content); + } + + // Restore the original content of any inline images + $pdfInlineImages = array_reverse($pdfInlineImages, true); + foreach ($pdfInlineImages as $id => $image) { + $content = str_replace( + '^^^'.$id.'^^^', + "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n", + $content + ); + } + + $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content)); + + return $content; + } + + /** + * getSectionsText() now takes an entire, unformatted + * document stream as a string, cleans it, then filters out + * commands that aren't needed for text positioning/extraction. It + * returns an array of unprocessed PDF commands, one command per + * element. + * + * @internal + */ + public function getSectionsText(?string $content): array + { + $sections = []; + + // A cleaned stream has one command on every line, so split the + // cleaned stream content on \r\n into an array + $textCleaned = preg_split( + '/(\r\n|\n|\r)/', + $this->formatContent($content), + -1, + \PREG_SPLIT_NO_EMPTY + ); + + $inTextBlock = false; + foreach ($textCleaned as $line) { + $line = trim($line); + + // Skip empty lines + if ('' === $line) { + continue; + } + + // If a 'BT' is encountered, set the $inTextBlock flag + if (preg_match('/BT$/', $line)) { + $inTextBlock = true; + $sections[] = $line; + + // If an 'ET' is encountered, unset the $inTextBlock flag + } elseif ('ET' == $line) { + $inTextBlock = false; + $sections[] = $line; + } elseif ($inTextBlock) { + // If we are inside a BT ... ET text block, save all lines + $sections[] = trim($line); + } else { + // Otherwise, if we are outside of a text block, only + // save specific, necessary lines. Care should be taken + // to ensure a command being checked for *only* matches + // that command. For instance, a simple search for 'c' + // may also match the 'sc' command. See the command + // list in the formatContent() method above. 
+ // Add more commands to save here as you find them in + // weird PDFs! + if ('q' == $line[-1] || 'Q' == $line[-1]) { + // Save and restore graphics state commands + $sections[] = $line; + } elseif (preg_match('/(?getFonts(); + } + + $firstFont = $this->document->getFirstFont(); + if (null !== $firstFont) { + $fonts[] = $firstFont; + } + + if (\count($fonts) > 0) { + return reset($fonts); + } + + return new Font($this->document, null, null, $this->config); + } + + /** + * Decode a '[]TJ' command and attempt to use alternate + * fonts if the current font results in output that contains + * Unicode control characters. + * + * @internal + * + * @param array> $command + */ + private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string + { + $orig_text = $font->decodeText($command, $fontFactor); + $text = $orig_text; + + // If we make this a Config option, we can add a check if it's + // enabled here. + if (null !== $page) { + $font_ids = array_keys($page->getFonts()); + + // If the decoded text contains UTF-8 control characters + // then the font page being used is probably the wrong one. + // Loop through the rest of the fonts to see if we can get + // a good decode. Allow x09 to x0d which are whitespace. 
+ while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) { + // If we're out of font IDs, then give up and use the + // original string + if (0 == \count($font_ids)) { + return $orig_text; + } + + // Try the next font ID + $font = $page->getFont(array_shift($font_ids)); + $text = $font->decodeText($command, $fontFactor); + } + } + + return $text; + } + + /** + * Expects a string that is a full PDF dictionary object, + * including the outer enclosing << >> angle brackets + * + * @internal + * + * @throws InvalidDictionaryObjectException + */ + public function parseDictionary(string $dictionary): array + { + // Normalize whitespace + $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary)); + + if ('<<' != substr($dictionary, 0, 2)) { + throw new InvalidDictionaryObjectException('Not a valid dictionary object.'); + } + + $parsed = []; + $stack = []; + $currentName = ''; + $arrayTypeNumeric = false; + + // Remove outer layer of dictionary, and split on tokens + $split = preg_split( + '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/', + trim(preg_replace('/^<<|>>$/', '', $dictionary)), + -1, + \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE + ); + + foreach ($split as $token) { + $token = trim($token); + switch ($token) { + case '': + break; + + // Open numeric array + case '[': + $parsed[$currentName] = []; + $arrayTypeNumeric = true; + + // Move up one level in the stack + $stack[\count($stack)] = &$parsed; + $parsed = &$parsed[$currentName]; + $currentName = ''; + break; + + // Open hashed array + case '<<': + $parsed[$currentName] = []; + $arrayTypeNumeric = false; + + // Move up one level in the stack + $stack[\count($stack)] = &$parsed; + $parsed = &$parsed[$currentName]; + $currentName = ''; + break; + + // Close numeric array + case ']': + // Revert string type arrays back to a single element + if (\is_array($parsed) && 1 == \count($parsed) + && isset($parsed[0]) && \is_string($parsed[0]) + && '' !== 
$parsed[0] && '/' != $parsed[0][0]) { + $parsed = '['.$parsed[0].']'; + } + // Close hashed array + // no break + case '>>': + $arrayTypeNumeric = false; + + // Move down one level in the stack + $parsed = &$stack[\count($stack) - 1]; + unset($stack[\count($stack) - 1]); + break; + + default: + // If value begins with a slash, then this is a name + // Add it to the appropriate array + if ('/' == substr($token, 0, 1)) { + $currentName = substr($token, 1); + if (true == $arrayTypeNumeric) { + $parsed[] = $currentName; + $currentName = ''; + } + } elseif ('' != $currentName) { + if (false == $arrayTypeNumeric) { + $parsed[$currentName] = $token; + } + $currentName = ''; + } elseif ('' == $currentName) { + $parsed[] = $token; + } + } + } + + return $parsed; + } + + /** + * Returns the text content of a PDF as a string. Attempts to add + * whitespace for spacing and line-breaks where appropriate. + * + * getText() leverages getTextArray() to get the content + * of the document, setting the addPositionWhitespace flag to true + * so whitespace is inserted in a logical way for reading by + * humans. + */ + public function getText(?Page $page = null): string + { + $this->addPositionWhitespace = true; + $result = $this->getTextArray($page); + $this->addPositionWhitespace = false; + + return implode('', $result).' '; + } + + /** + * Returns the text content of a PDF as an array of strings. No + * extra whitespace is inserted besides what is actually encoded in + * the PDF text. 
+ * + * @throws \Exception + */ + public function getTextArray(?Page $page = null): array + { + $result = []; + $text = []; + + $marked_stack = []; + $last_written_position = false; + + $sections = $this->getSectionsText($this->content); + $current_font = $this->getDefaultFont($page); + $current_font_size = 1; + $current_text_leading = 0; + + $current_position = ['x' => false, 'y' => false]; + $current_position_tm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + $current_position_td = ['x' => 0, 'y' => 0]; + $current_position_cm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + + $clipped_font = []; + $clipped_position_cm = []; + + self::$recursionStack[] = $this->getUniqueId(); + + foreach ($sections as $section) { + $commands = $this->getCommandsText($section); + foreach ($commands as $command) { + switch ($command[self::OPERATOR]) { + // Begin text object + case 'BT': + // Reset text positioning matrices + $current_position_tm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + $current_position_td = ['x' => 0, 'y' => 0]; + $current_text_leading = 0; + break; + + // Begin marked content sequence with property list + case 'BDC': + if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) { + $dict = $this->parseDictionary($match[1]); + + // Check for ActualText block + if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) { + if ('[' == $dict['ActualText'][0]) { + // Simulate a 'TJ' command on the stack + $marked_stack[] = [ + 'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0], + ]; + } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) { + // Simulate a 'Tj' command on the stack + $marked_stack[] = [ + 'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0], + ]; + } + } + } + break; + + // Begin 
marked content sequence + case 'BMC': + if ('ReversedChars' == $command[self::COMMAND]) { + // Upon encountering a ReversedChars command, + // add the characters we've built up so far to + // the result array + $result = array_merge($result, $text); + + // Start a fresh $text array that will contain + // reversed characters + $text = []; + + // Add the reversed text flag to the stack + $marked_stack[] = ['ReversedChars' => true]; + } + break; + + // set graphics position matrix + case 'cm': + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $current_position_cm = [ + 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, + 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, + 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, + ]; + break; + + case 'Do': + if (is_null($page)) { + break; + } + + $args = preg_split('/\s/s', $command[self::COMMAND]); + $id = trim(array_pop($args), '/ '); + $xobject = $page->getXObject($id); + + // Check we got a PDFObject back. + if (!$xobject instanceof self) { + break; + } + + // If the PDFObject is an Image or a Form, do nothing as + // neither of these XObject types are text. + if ($xobject instanceof Image || $xobject instanceof Form) { + break; + } + + // Check this is not a circular reference. 
+ if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) { + $text[] = $xobject->getText($page); + } + break; + + // Marked content point with (DP) & without (MP) property list + case 'DP': + case 'MP': + break; + + // End text object + case 'ET': + break; + + // Store current selected font and graphics matrix + case 'q': + $clipped_font[] = [$current_font, $current_font_size]; + $clipped_position_cm[] = $current_position_cm; + break; + + // Restore previous selected font and graphics matrix + case 'Q': + list($current_font, $current_font_size) = array_pop($clipped_font); + $current_position_cm = array_pop($clipped_position_cm); + break; + + // End marked content sequence + case 'EMC': + $data = false; + if (\count($marked_stack)) { + $marked = array_pop($marked_stack); + $action = key($marked); + $data = $marked[$action]; + + switch ($action) { + // If we are in ReversedChars mode... + case 'ReversedChars': + // Reverse the characters we've built up so far + foreach ($text as $key => $t) { + $text[$key] = implode('', array_reverse( + mb_str_split($t, 1, mb_internal_encoding()) + )); + } + + // Add these characters to the result array + $result = array_merge($result, $text); + + // Start a fresh $text array that will contain + // non-reversed characters + $text = []; + break; + + case 'ActualText': + // Use the content of the ActualText as a command + $command = $data; + break; + } + } + + // If this EMC command has been transformed into a 'Tj' + // or 'TJ' command because of being ActualText, then bypass + // the break to proceed to the writing section below. 
+ if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) { + break; + } + + // no break + case "'": + case '"': + if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) { + // Move to next line and write text + $current_position['x'] = 0; + $current_position_td['x'] = 0; + $current_position_td['y'] += $current_text_leading; + } + // no break + case 'Tj': + $command[self::COMMAND] = [$command]; + // no break + case 'TJ': + // Check the marked content stack for flags + $actual_text = false; + $reverse_text = false; + foreach ($marked_stack as $marked) { + if (isset($marked['ActualText'])) { + $actual_text = true; + } + if (isset($marked['ReversedChars'])) { + $reverse_text = true; + } + } + + // Account for text position ONLY just before we write text + if (false === $actual_text && \is_array($last_written_position)) { + // If $last_written_position is an array, that + // means we have stored text position coordinates + // for placing an ActualText + $currentX = $last_written_position[0]; + $currentY = $last_written_position[1]; + $last_written_position = false; + } else { + $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x']; + $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y']; + } + $whiteSpace = ''; + + $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i']; + $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j']; + + if (true === $this->addPositionWhitespace && false !== $current_position['x']) { + $curY = $currentY - $current_position['y']; + if (abs($curY) >= abs($factorY) / 4) { + $whiteSpace = "\n"; + } else { + if (true === $reverse_text) { + $curX = $current_position['x'] - $currentX; + } else { + $curX = $currentX - $current_position['x']; + } + + // In abs($factorX * 7) below, the 7 is chosen arbitrarily + // as the number of 
apparent "spaces" in a document we + // would need before considering them a "tab". In the + // future, we might offer this value to users as a config + // option. + if ($curX >= abs($factorX * 7)) { + $whiteSpace = "\t"; + } elseif ($curX >= abs($factorX * 2)) { + $whiteSpace = ' '; + } + } + } + + $newtext = $this->getTJUsingFontFallback( + $current_font, + $command[self::COMMAND], + $page, + $factorX + ); + + // If there is no ActualText pending then write + if (false === $actual_text) { + $newtext = str_replace(["\r", "\n"], '', $newtext); + if (false !== $reverse_text) { + // If we are in ReversedChars mode, add the whitespace last + $text[] = preg_replace('/ $/', ' ', $newtext.$whiteSpace); + } else { + // Otherwise add the whitespace first + if (' ' === $whiteSpace && isset($text[\count($text) - 1])) { + $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]); + } + $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext); + } + + // Record the position of this inserted text for comparison + // with the next text block. + // Provide a 'fudge' factor guess on how wide this text block + // is based on the number of characters. This helps limit the + // number of tabs inserted, but isn't perfect. 
+ $factor = $factorX / 2; + $current_position = [ + 'x' => $currentX - mb_strlen($newtext) * $factor, + 'y' => $currentY, + ]; + } elseif (false === $last_written_position) { + // If there is an ActualText in the pipeline + // store the position this undisplayed text + // *would* have been written to, so the + // ActualText is displayed in the right spot + $last_written_position = [$currentX, $currentY]; + $current_position['x'] = $currentX; + } + break; + + // move to start of next line + case 'T*': + $current_position['x'] = 0; + $current_position_td['x'] = 0; + $current_position_td['y'] += $current_text_leading; + break; + + // set character spacing + case 'Tc': + break; + + // move text current point and set leading + case 'Td': + case 'TD': + // move text current point + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $y = (float) array_pop($args); + $x = (float) array_pop($args); + + if ('TD' == $command[self::OPERATOR]) { + $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j']; + } + + $current_position_td = [ + 'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'], + 'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'], + ]; + break; + + case 'Tf': + $args = preg_split('/\s/s', $command[self::COMMAND]); + $size = (float) array_pop($args); + $id = trim(array_pop($args), '/'); + if (null !== $page) { + $new_font = $page->getFont($id); + // If an invalid font ID is given, do not update the font. + // This should theoretically never happen, as the PDF spec states for the Tf operator: + // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" + // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) + // But we want to make sure that malformed PDFs do not simply crash. 
+ if (null !== $new_font) { + $current_font = $new_font; + $current_font_size = $size; + } + } + break; + + // set leading + case 'TL': + $y = (float) $command[self::COMMAND]; + $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j']; + break; + + // set text position matrix + case 'Tm': + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $current_position_tm = [ + 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, + 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, + 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, + ]; + break; + + // set text rendering mode + case 'Ts': + break; + + // set super/subscripting text rise + case 'Ts': + break; + + // set word spacing + case 'Tw': + break; + + // set horizontal scaling + case 'Tz': + break; + + default: + } + } + } + + $result = array_merge($result, $text); + + return $result; + } + + /** + * getCommandsText() expects the content of $text_part to be an + * already formatted, single-line command from a document stream. + * The companion function getSectionsText() returns a document + * stream as an array of single commands for just this purpose. + * Because of this, the argument $offset is no longer used, and + * may be removed in a future PdfParser release. + * + * A better name for this function would be getCommandText() + * since it now always works on just one command. + */ + public function getCommandsText(string $text_part, int &$offset = 0): array + { + $commands = $matches = []; + + preg_match('/^(([\/\[\(<])?.*)(? 
'(', + self::OPERATOR => 'TJ', + self::COMMAND => $tjmatch[1], + ]; + if (isset($tjmatch[2]) && trim($tjmatch[2])) { + $subcommand[] = [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => $tjmatch[2], + ]; + } + $command = substr($command, \strlen($tjmatch[0])); + } + + // Search for hexadecimal <> format + if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) { + $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]); + $subcommand[] = [ + self::TYPE => '<', + self::OPERATOR => 'TJ', + self::COMMAND => $tjmatch[1], + ]; + if (isset($tjmatch[2]) && trim($tjmatch[2])) { + $subcommand[] = [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => $tjmatch[2], + ]; + } + $command = substr($command, \strlen($tjmatch[0])); + } + } while ($command != $oldCommand); + + $command = $subcommand; + } elseif ('Tj' == $operator || "'" == $operator || '"' == $operator) { + // Depending on the string type, trim the data of the + // appropriate delimiters + if ('(' == $type) { + // Don't use trim() here since a () string may end with + // a balanced or escaped right parentheses, and trim() + // will delete both. Both strings below are valid: + // eg. (String()) + // eg. (String\)) + $command = preg_replace('/^\(|\)$/', '', $command); + } elseif ('<' == $type) { + $command = trim($command, '<>'); + } + } elseif ('/' == $type) { + $command = substr($command, 1); + } + + $commands[] = [ + self::TYPE => $type, + self::OPERATOR => $operator, + self::COMMAND => $command, + ]; + + return $commands; + } + + public static function factory( + Document $document, + Header $header, + ?string $content, + ?Config $config = null + ): self { + switch ($header->get('Type')->getContent()) { + case 'XObject': + switch ($header->get('Subtype')->getContent()) { + case 'Image': + return new Image($document, $header, $config->getRetainImageContent() ? 
$content : null, $config); + + case 'Form': + return new Form($document, $header, $content, $config); + } + + return new self($document, $header, $content, $config); + + case 'Pages': + return new Pages($document, $header, $content, $config); + + case 'Page': + return new Page($document, $header, $content, $config); + + case 'Encoding': + return new Encoding($document, $header, $content, $config); + + case 'Font': + $subtype = $header->get('Subtype')->getContent(); + $classname = '\Smalot\PdfParser\Font\Font'.$subtype; + + if (class_exists($classname)) { + return new $classname($document, $header, $content, $config); + } + + return new Font($document, $header, $content, $config); + + default: + return new self($document, $header, $content, $config); + } + } + + /** + * Returns unique id identifying the object. + */ + protected function getUniqueId(): string + { + return spl_object_hash($this); + } +} diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php index 496f0642..1bd29e1e 100644 --- a/src/Smalot/PdfParser/Page.php +++ b/src/Smalot/PdfParser/Page.php @@ -5,64 +5,82 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser; use Smalot\PdfParser\Element\ElementArray; use Smalot\PdfParser\Element\ElementMissing; +use Smalot\PdfParser\Element\ElementNull; use Smalot\PdfParser\Element\ElementXRef; -/** - * Class Page - * - * @package Smalot\PdfParser - */ -class Page extends Object +class Page extends PDFObject { /** * @var Font[] */ - protected $fonts = null; + protected $fonts; /** - * @var Object[] + * @var PDFObject[] */ - protected $xobjects = null; + protected $xobjects; + + /** + * @var array + */ + protected $dataTm; + + /** + * @param array<\Smalot\PdfParser\Font> $fonts + * + * @internal + */ + public function setFonts($fonts) + { + if (empty($this->fonts)) { + $this->fonts = $fonts; + } + } /** * @return Font[] */ public function getFonts() { - if (!is_null($this->fonts)) { + if (null !== $this->fonts) { return $this->fonts; } $resources = $this->get('Resources'); - if ($resources->has('Font')) { + if (method_exists($resources, 'has') && $resources->has('Font')) { + if ($resources->get('Font') instanceof ElementMissing) { + return []; + } if ($resources->get('Font') instanceof Header) { $fonts = $resources->get('Font')->getElements(); @@ -70,120 +88,114 @@ public function getFonts() $fonts = $resources->get('Font')->getHeader()->getElements(); } - $table = array(); + $table = []; foreach ($fonts as $id => $font) { - $table[$id] = $font; - - // Store too on cleaned id value (only numeric) - $id = preg_replace('/[^0-9\.\-_]/', '', $id); - if ($id != '') { + if ($font instanceof Font) { $table[$id] = $font; + + // Store too on cleaned id value (only numeric) + $id = preg_replace('/[^0-9\.\-_]/', '', $id); + if ('' != $id) { + $table[$id] = $font; 
+ } } } - return ($this->fonts = $table); - } else { - return array(); + return $this->fonts = $table; } + + return []; } - /** - * @param string $id - * - * @return Font - */ - public function getFont($id) + public function getFont(string $id): ?Font { $fonts = $this->getFonts(); + if (isset($fonts[$id])) { + return $fonts[$id]; + } + + // According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238) + // "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources" + // Instead, we search for the unfiltered name first and then do this cleaning as a fallback, so all tests still pass. + if (isset($fonts[$id])) { return $fonts[$id]; } else { $id = preg_replace('/[^0-9\.\-_]/', '', $id); - if (isset($fonts[$id])) { return $fonts[$id]; - } else { - return null; } } + + return null; } /** * Support for XObject * - * @return Object[] + * @return PDFObject[] */ public function getXObjects() { - if (!is_null($this->xobjects)) { + if (null !== $this->xobjects) { return $this->xobjects; } $resources = $this->get('Resources'); - if ($resources->has('XObject')) { - + if (method_exists($resources, 'has') && $resources->has('XObject')) { if ($resources->get('XObject') instanceof Header) { $xobjects = $resources->get('XObject')->getElements(); } else { $xobjects = $resources->get('XObject')->getHeader()->getElements(); } - $table = array(); + $table = []; foreach ($xobjects as $id => $xobject) { $table[$id] = $xobject; // Store too on cleaned id value (only numeric) $id = preg_replace('/[^0-9\.\-_]/', '', $id); - if ($id != '') { + if ('' != $id) { $table[$id] = $xobject; } } - return ($this->xobjects = $table); - } else { - return array(); + return $this->xobjects = $table; } + + return []; } - /** - * @param string $id - * - * @return Object - */ - public function getXObject($id) + public function getXObject(string $id): ?PDFObject { $xobjects = $this->getXObjects(); + if 
(isset($xobjects[$id])) { + return $xobjects[$id]; + } + + return null; + /*$id = preg_replace('/[^0-9\.\-_]/', '', $id); + if (isset($xobjects[$id])) { return $xobjects[$id]; } else { return null; - /*$id = preg_replace('/[^0-9\.\-_]/', '', $id); - - if (isset($xobjects[$id])) { - return $xobjects[$id]; - } else { - return null; - }*/ - } + }*/ } - /** - * @param Page - * - * @return string - */ - public function getText(Page $page = null) + public function getText(?self $page = null): string { if ($contents = $this->get('Contents')) { - if ($contents instanceof ElementMissing) { return ''; - } elseif ($contents instanceof Object) { + } elseif ($contents instanceof ElementNull) { + return ''; + } elseif ($contents instanceof PDFObject) { $elements = $contents->getHeader()->getElements(); if (is_numeric(key($elements))) { @@ -197,24 +209,806 @@ public function getText(Page $page = null) } } - $header = new Header(array(), $this->document); - $contents = new Object($this->document, $header, $new_content); + $header = new Header([], $this->document); + $contents = new PDFObject($this->document, $header, $new_content, $this->config); } } elseif ($contents instanceof ElementArray) { // Create a virtual global content. $new_content = ''; foreach ($contents->getContent() as $content) { - $new_content .= $content->getContent() . "\n"; + $new_content .= $content->getContent()."\n"; } - $header = new Header(array(), $this->document); - $contents = new Object($this->document, $header, $new_content); + $header = new Header([], $this->document); + $contents = new PDFObject($this->document, $header, $new_content, $this->config); } - return $contents->getText($this); + /* + * Elements referencing each other on the same page can cause endless loops during text parsing. + * To combat this we keep a recursionStack containing already parsed elements on the page. + * The stack is only emptied here after getting text from a page. 
+ */ + $contentsText = $contents->getText($this); + PDFObject::$recursionStack = []; + + return $contentsText; } return ''; } + + /** + * Return true if the current page is a (setasign\Fpdi\Fpdi) FPDI/FPDF document + * + * The metadata 'Producer' should have the value of "FPDF" . FPDF_VERSION if the + * pdf file was generated by FPDF/Fpfi. + * + * @return bool true is the current page is a FPDI/FPDF document + */ + public function isFpdf(): bool + { + if (\array_key_exists('Producer', $this->document->getDetails()) + && \is_string($this->document->getDetails()['Producer']) + && 0 === strncmp($this->document->getDetails()['Producer'], 'FPDF', 4)) { + return true; + } + + return false; + } + + /** + * Return the page number of the PDF document of the page object + * + * @return int the page number + */ + public function getPageNumber(): int + { + $pages = $this->document->getPages(); + $numOfPages = \count($pages); + for ($pageNum = 0; $pageNum < $numOfPages; ++$pageNum) { + if ($pages[$pageNum] === $this) { + break; + } + } + + return $pageNum; + } + + /** + * Return the Object of the page if the document is a FPDF/FPDI document + * + * If the document was generated by FPDF/FPDI it returns the + * PDFObject of the given page + * + * @return PDFObject The PDFObject for the page + */ + public function getPDFObjectForFpdf(): PDFObject + { + $pageNum = $this->getPageNumber(); + $xObjects = $this->getXObjects(); + + return $xObjects[$pageNum]; + } + + /** + * Return a new PDFObject of the document created with FPDF/FPDI + * + * For a document generated by FPDF/FPDI, it generates a + * new PDFObject for that document + * + * @return PDFObject The PDFObject + */ + public function createPDFObjectForFpdf(): PDFObject + { + $pdfObject = $this->getPDFObjectForFpdf(); + $new_content = $pdfObject->getContent(); + $header = $pdfObject->getHeader(); + $config = $pdfObject->config; + + return new PDFObject($pdfObject->document, $header, $new_content, $config); + } + + /** + * 
Return page if document is a FPDF/FPDI document + * + * @return Page The page + */ + public function createPageForFpdf(): self + { + $pdfObject = $this->getPDFObjectForFpdf(); + $new_content = $pdfObject->getContent(); + $header = $pdfObject->getHeader(); + $config = $pdfObject->config; + + return new self($pdfObject->document, $header, $new_content, $config); + } + + public function getTextArray(?self $page = null): array + { + if ($this->isFpdf()) { + $pdfObject = $this->getPDFObjectForFpdf(); + $newPdfObject = $this->createPDFObjectForFpdf(); + + return $newPdfObject->getTextArray($pdfObject); + } else { + if ($contents = $this->get('Contents')) { + if ($contents instanceof ElementMissing) { + return []; + } elseif ($contents instanceof ElementNull) { + return []; + } elseif ($contents instanceof PDFObject) { + $elements = $contents->getHeader()->getElements(); + + if (is_numeric(key($elements))) { + $new_content = ''; + + /** @var PDFObject $element */ + foreach ($elements as $element) { + if ($element instanceof ElementXRef) { + $new_content .= $element->getObject()->getContent(); + } else { + $new_content .= $element->getContent(); + } + } + + $header = new Header([], $this->document); + $contents = new PDFObject($this->document, $header, $new_content, $this->config); + } else { + try { + $contents->getTextArray($this); + } catch (\Throwable $e) { + return $contents->getTextArray(); + } + } + } elseif ($contents instanceof ElementArray) { + // Create a virtual global content. + $new_content = ''; + + /** @var PDFObject $content */ + foreach ($contents->getContent() as $content) { + $new_content .= $content->getContent()."\n"; + } + + $header = new Header([], $this->document); + $contents = new PDFObject($this->document, $header, $new_content, $this->config); + } + + return $contents->getTextArray($this); + } + + return []; + } + } + + /** + * Gets all the text data with its internal representation of the page. 
+ * + * Returns an array with the data and the internal representation + */ + public function extractRawData(): array + { + /* + * Now you can get the complete content of the object with the text on it + */ + $extractedData = []; + $content = $this->get('Contents'); + $values = $content->getContent(); + if (isset($values) && \is_array($values)) { + $text = ''; + foreach ($values as $section) { + $text .= $section->getContent(); + } + $sectionsText = $this->getSectionsText($text); + foreach ($sectionsText as $sectionText) { + $commandsText = $this->getCommandsText($sectionText); + foreach ($commandsText as $command) { + $extractedData[] = $command; + } + } + } else { + if ($this->isFpdf()) { + $content = $this->getPDFObjectForFpdf(); + } + $sectionsText = $content->getSectionsText($content->getContent()); + foreach ($sectionsText as $sectionText) { + $commandsText = $content->getCommandsText($sectionText); + foreach ($commandsText as $command) { + $extractedData[] = $command; + } + } + } + + return $extractedData; + } + + /** + * Gets all the decoded text data with it internal representation from a page. 
+ * + * @param array $extractedRawData the extracted data return by extractRawData or + * null if extractRawData should be called + * + * @return array An array with the data and the internal representation + */ + public function extractDecodedRawData(?array $extractedRawData = null): array + { + if (!isset($extractedRawData) || !$extractedRawData) { + $extractedRawData = $this->extractRawData(); + } + $currentFont = null; /** @var Font $currentFont */ + $clippedFont = null; + $fpdfPage = null; + if ($this->isFpdf()) { + $fpdfPage = $this->createPageForFpdf(); + } + foreach ($extractedRawData as &$command) { + if ('Tj' == $command['o'] || 'TJ' == $command['o']) { + $data = $command['c']; + if (!\is_array($data)) { + $tmpText = ''; + if (isset($currentFont)) { + $tmpText = $currentFont->decodeOctal($data); + // $tmpText = $currentFont->decodeHexadecimal($tmpText, false); + } + $tmpText = str_replace( + ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '], + ['\\', '(', ')', "\n", "\r", "\t", ' '], + $tmpText + ); + $tmpText = mb_convert_encoding($tmpText, 'UTF-8', 'ISO-8859-1'); + if (isset($currentFont)) { + $tmpText = $currentFont->decodeContent($tmpText); + } + $command['c'] = $tmpText; + continue; + } + $numText = \count($data); + for ($i = 0; $i < $numText; ++$i) { + if (0 != ($i % 2)) { + continue; + } + $tmpText = $data[$i]['c']; + $decodedText = isset($currentFont) ? 
$currentFont->decodeOctal($tmpText) : $tmpText; + $decodedText = str_replace( + ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '], + ['\\', '(', ')', "\n", "\r", "\t", ' '], + $decodedText + ); + + $decodedText = mb_convert_encoding($decodedText, 'UTF-8', 'ISO-8859-1'); + + if (isset($currentFont)) { + $decodedText = $currentFont->decodeContent($decodedText); + } + $command['c'][$i]['c'] = $decodedText; + continue; + } + } elseif ('Tf' == $command['o'] || 'TF' == $command['o']) { + $fontId = explode(' ', $command['c'])[0]; + // If document is a FPDI/FPDF the $page has the correct font + $currentFont = isset($fpdfPage) ? $fpdfPage->getFont($fontId) : $this->getFont($fontId); + continue; + } elseif ('Q' == $command['o']) { + $currentFont = $clippedFont; + } elseif ('q' == $command['o']) { + $clippedFont = $currentFont; + } + } + + return $extractedRawData; + } + + /** + * Gets just the Text commands that are involved in text positions and + * Text Matrix (Tm) + * + * It extract just the PDF commands that are involved with text positions, and + * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ + * + * @param array $extractedDecodedRawData The data extracted by extractDecodeRawData. + * If it is null, the method extractDecodeRawData is called. 
+ * + * @return array An array with the text command of the page + */ + public function getDataCommands(?array $extractedDecodedRawData = null): array + { + if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) { + $extractedDecodedRawData = $this->extractDecodedRawData(); + } + $extractedData = []; + foreach ($extractedDecodedRawData as $command) { + switch ($command['o']) { + /* + * BT + * Begin a text object, inicializind the Tm and Tlm to identity matrix + */ + case 'BT': + $extractedData[] = $command; + break; + /* + * cm + * Concatenation Matrix that will transform all following Tm + */ + case 'cm': + $extractedData[] = $command; + break; + /* + * ET + * End a text object, discarding the text matrix + */ + case 'ET': + $extractedData[] = $command; + break; + + /* + * leading TL + * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators. + * Initial value: 0 + */ + case 'TL': + $extractedData[] = $command; + break; + + /* + * tx ty Td + * Move to the start of the next line, offset form the start of the + * current line by tx, ty. + */ + case 'Td': + $extractedData[] = $command; + break; + + /* + * tx ty TD + * Move to the start of the next line, offset form the start of the + * current line by tx, ty. As a side effect, this operator set the leading + * parameter in the text state. This operator has the same effect as the + * code: + * -ty TL + * tx ty Td + */ + case 'TD': + $extractedData[] = $command; + break; + + /* + * a b c d e f Tm + * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are + * all numbers, and the initial value for Tm and Tlm is the identity matrix + * [1 0 0 1 0 0] + */ + case 'Tm': + $extractedData[] = $command; + break; + + /* + * T* + * Move to the start of the next line. This operator has the same effect + * as the code: + * 0 Tl Td + * Where Tl is the current leading parameter in the text state. 
+ */ + case 'T*': + $extractedData[] = $command; + break; + + /* + * string Tj + * Show a Text String + */ + case 'Tj': + $extractedData[] = $command; + break; + + /* + * string ' + * Move to the next line and show a text string. This operator has the + * same effect as the code: + * T* + * string Tj + */ + case "'": + $extractedData[] = $command; + break; + + /* + * aw ac string " + * Move to the next lkine and show a text string, using aw as the word + * spacing and ac as the character spacing. This operator has the same + * effect as the code: + * aw Tw + * ac Tc + * string ' + * Tw set the word spacing, Tw, to wordSpace. + * Tc Set the character spacing, Tc, to charsSpace. + */ + case '"': + $extractedData[] = $command; + break; + + case 'Tf': + case 'TF': + $extractedData[] = $command; + break; + + /* + * array TJ + * Show one or more text strings allow individual glyph positioning. + * Each lement of array con be a string or a number. If the element is + * a string, this operator shows the string. If it is a number, the + * operator adjust the text position by that amount; that is, it translates + * the text matrix, Tm. This amount is substracted form the current + * horizontal or vertical coordinate, depending on the writing mode. + * in the default coordinate system, a positive adjustment has the effect + * of moving the next glyph painted either to the left or down by the given + * amount. + */ + case 'TJ': + $extractedData[] = $command; + break; + /* + * q + * Save current graphics state to stack + */ + case 'q': + /* + * Q + * Load last saved graphics state from stack + */ + case 'Q': + $extractedData[] = $command; + break; + default: + } + } + + return $extractedData; + } + + /** + * Gets the Text Matrix of the text in the page + * + * Return an array where every item is an array where the first item is the + * Text Matrix (Tm) and the second is a string with the text data. The Text matrix + * is an array of 6 numbers. 
The last 2 numbers are the coordinates X and Y of the + * text. The first 4 numbers has to be with Scalation, Rotation and Skew of the text. + * + * @param array $dataCommands the data extracted by getDataCommands + * if null getDataCommands is called + * + * @return array an array with the data of the page including the Tm information + * of any text in the page + */ + public function getDataTm(?array $dataCommands = null): array + { + if (!isset($dataCommands) || !$dataCommands) { + $dataCommands = $this->getDataCommands(); + } + + /* + * At the beginning of a text object Tm is the identity matrix + */ + $defaultTm = ['1', '0', '0', '1', '0', '0']; + $concatTm = ['1', '0', '0', '1', '0', '0']; + $graphicsStatesStack = []; + /* + * Set the text leading used by T*, ' and " operators + */ + $defaultTl = 0; + + /* + * Set default values for font data + */ + $defaultFontId = -1; + $defaultFontSize = 1; + + /* + * Indexes of horizontal/vertical scaling and X,Y-coordinates in the matrix (Tm) + */ + $hSc = 0; // horizontal scaling + /** + * index of vertical scaling in the array that encodes the text matrix. 
+ * for more information: https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500 + */ + $vSc = 3; + $x = 4; + $y = 5; + + /* + * x,y-coordinates of text space origin in user units + * + * These will be assigned the value of the currently printed string + */ + $Tx = 0; + $Ty = 0; + + $Tm = $defaultTm; + $Tl = $defaultTl; + $fontId = $defaultFontId; + $fontSize = $defaultFontSize; // reflects fontSize set by Tf or Tfs + + $extractedTexts = $this->getTextArray(); + $extractedData = []; + foreach ($dataCommands as $command) { + // If we've used up all the texts from getTextArray(), exit + // so we aren't accessing non-existent array indices + // Fixes 'undefined array key' errors in Issues #575, #576 + if (\count($extractedTexts) <= \count($extractedData)) { + break; + } + $currentText = $extractedTexts[\count($extractedData)]; + switch ($command['o']) { + /* + * BT + * Begin a text object, initializing the Tm and Tlm to identity matrix + */ + case 'BT': + $Tm = $defaultTm; + $Tl = $defaultTl; + $Tx = 0; + $Ty = 0; + break; + + case 'cm': + $newConcatTm = (array) explode(' ', $command['c']); + $TempMatrix = []; + // Multiply with previous concatTm + $TempMatrix[0] = (float) $concatTm[0] * (float) $newConcatTm[0] + (float) $concatTm[1] * (float) $newConcatTm[2]; + $TempMatrix[1] = (float) $concatTm[0] * (float) $newConcatTm[1] + (float) $concatTm[1] * (float) $newConcatTm[3]; + $TempMatrix[2] = (float) $concatTm[2] * (float) $newConcatTm[0] + (float) $concatTm[3] * (float) $newConcatTm[2]; + $TempMatrix[3] = (float) $concatTm[2] * (float) $newConcatTm[1] + (float) $concatTm[3] * (float) $newConcatTm[3]; + $TempMatrix[4] = (float) $concatTm[4] * (float) $newConcatTm[0] + (float) $concatTm[5] * (float) $newConcatTm[2] + (float) $newConcatTm[4]; + $TempMatrix[5] = (float) $concatTm[4] * (float) $newConcatTm[1] + (float) $concatTm[5] * (float) $newConcatTm[3] + (float) $newConcatTm[5]; + $concatTm = $TempMatrix; + break; + /* + * ET + * End a text object + */ + 
case 'ET': + break; + + /* + * text leading TL + * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators. + * Initial value: 0 + */ + case 'TL': + // scaled text leading + $Tl = (float) $command['c'] * (float) $Tm[$vSc]; + break; + + /* + * tx ty Td + * Move to the start of the next line, offset from the start of the + * current line by tx, ty. + */ + case 'Td': + $coord = explode(' ', $command['c']); + $Tx += (float) $coord[0] * (float) $Tm[$hSc]; + $Ty += (float) $coord[1] * (float) $Tm[$vSc]; + $Tm[$x] = (string) $Tx; + $Tm[$y] = (string) $Ty; + break; + + /* + * tx ty TD + * Move to the start of the next line, offset form the start of the + * current line by tx, ty. As a side effect, this operator set the leading + * parameter in the text state. This operator has the same effect as the + * code: + * -ty TL + * tx ty Td + */ + case 'TD': + $coord = explode(' ', $command['c']); + $Tl = -((float) $coord[1] * (float) $Tm[$vSc]); + $Tx += (float) $coord[0] * (float) $Tm[$hSc]; + $Ty += (float) $coord[1] * (float) $Tm[$vSc]; + $Tm[$x] = (string) $Tx; + $Tm[$y] = (string) $Ty; + break; + + /* + * a b c d e f Tm + * Set the text matrix, Tm, and the text line matrix, Tlm. 
The operands are + * all numbers, and the initial value for Tm and Tlm is the identity matrix + * [1 0 0 1 0 0] + */ + case 'Tm': + $Tm = explode(' ', $command['c']); + $TempMatrix = []; + $TempMatrix[0] = (float) $Tm[0] * (float) $concatTm[0] + (float) $Tm[1] * (float) $concatTm[2]; + $TempMatrix[1] = (float) $Tm[0] * (float) $concatTm[1] + (float) $Tm[1] * (float) $concatTm[3]; + $TempMatrix[2] = (float) $Tm[2] * (float) $concatTm[0] + (float) $Tm[3] * (float) $concatTm[2]; + $TempMatrix[3] = (float) $Tm[2] * (float) $concatTm[1] + (float) $Tm[3] * (float) $concatTm[3]; + $TempMatrix[4] = (float) $Tm[4] * (float) $concatTm[0] + (float) $Tm[5] * (float) $concatTm[2] + (float) $concatTm[4]; + $TempMatrix[5] = (float) $Tm[4] * (float) $concatTm[1] + (float) $Tm[5] * (float) $concatTm[3] + (float) $concatTm[5]; + $Tm = $TempMatrix; + $Tx = (float) $Tm[$x]; + $Ty = (float) $Tm[$y]; + break; + + /* + * T* + * Move to the start of the next line. This operator has the same effect + * as the code: + * 0 Tl Td + * Where Tl is the current leading parameter in the text state. + */ + case 'T*': + $Ty -= $Tl; + $Tm[$y] = (string) $Ty; + break; + + /* + * string Tj + * Show a Text String + */ + case 'Tj': + $data = [$Tm, $currentText]; + if ($this->config->getDataTmFontInfoHasToBeIncluded()) { + $data[] = $fontId; + $data[] = $fontSize; + } + $extractedData[] = $data; + break; + + /* + * string ' + * Move to the next line and show a text string. This operator has the + * same effect as the code: + * T* + * string Tj + */ + case "'": + $Ty -= $Tl; + $Tm[$y] = (string) $Ty; + $extractedData[] = [$Tm, $currentText]; + break; + + /* + * aw ac string " + * Move to the next line and show a text string, using aw as the word + * spacing and ac as the character spacing. This operator has the same + * effect as the code: + * aw Tw + * ac Tc + * string ' + * Tw set the word spacing, Tw, to wordSpace. + * Tc Set the character spacing, Tc, to charsSpace. 
+ */ + case '"': + $data = explode(' ', $currentText); + $Ty -= $Tl; + $Tm[$y] = (string) $Ty; + $extractedData[] = [$Tm, $data[2]]; // Verify + break; + + case 'Tf': + /* + * From PDF 1.0 specification, page 106: + * fontname size Tf Set font and size + * Sets the text font and text size in the graphics state. There is no default value for + * either fontname or size; they must be selected using Tf before drawing any text. + * fontname is a resource name. size is a number expressed in text space units. + * + * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf + * Introduced with https://github.com/smalot/pdfparser/pull/516 + */ + list($fontId, $fontSize) = explode(' ', $command['c'], 2); + break; + + /* + * array TJ + * Show one or more text strings allow individual glyph positioning. + * Each lement of array con be a string or a number. If the element is + * a string, this operator shows the string. If it is a number, the + * operator adjust the text position by that amount; that is, it translates + * the text matrix, Tm. This amount is substracted form the current + * horizontal or vertical coordinate, depending on the writing mode. + * in the default coordinate system, a positive adjustment has the effect + * of moving the next glyph painted either to the left or down by the given + * amount. 
+ */ + case 'TJ': + $data = [$Tm, $currentText]; + if ($this->config->getDataTmFontInfoHasToBeIncluded()) { + $data[] = $fontId; + $data[] = $fontSize; + } + $extractedData[] = $data; + break; + /* + * q + * Save current graphics state to stack + */ + case 'q': + $graphicsStatesStack[] = $concatTm; + break; + /* + * Q + * Load last saved graphics state from stack + */ + case 'Q': + $concatTm = array_pop($graphicsStatesStack); + break; + default: + } + } + $this->dataTm = $extractedData; + + return $extractedData; + } + + /** + * Gets text data that are around the given coordinates (X,Y) + * + * If the text is in near the given coordinates (X,Y) (or the TM info), + * the text is returned. The extractedData return by getDataTm, could be use to see + * where is the coordinates of a given text, using the TM info for it. + * + * @param float $x The X value of the coordinate to search for. if null + * just the Y value is considered (same Row) + * @param float $y The Y value of the coordinate to search for + * just the X value is considered (same column) + * @param float $xError The value less or more to consider an X to be "near" + * @param float $yError The value less or more to consider an Y to be "near" + * + * @return array An array of text that are near the given coordinates. If no text + * "near" the x,y coordinate, an empty array is returned. If Both, x + * and y coordinates are null, null is returned. 
+ */ + public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array + { + if (!isset($this->dataTm) || !$this->dataTm) { + $this->getDataTm(); + } + + if (null !== $x) { + $x = (float) $x; + } + + if (null !== $y) { + $y = (float) $y; + } + + if (null === $x && null === $y) { + return []; + } + + $xError = (float) $xError; + $yError = (float) $yError; + + $extractedData = []; + foreach ($this->dataTm as $item) { + $tm = $item[0]; + $xTm = (float) $tm[4]; + $yTm = (float) $tm[5]; + $text = $item[1]; + if (null === $y) { + if (($xTm >= ($x - $xError)) + && ($xTm <= ($x + $xError))) { + $extractedData[] = [$tm, $text]; + continue; + } + } + if (null === $x) { + if (($yTm >= ($y - $yError)) + && ($yTm <= ($y + $yError))) { + $extractedData[] = [$tm, $text]; + continue; + } + } + if (($xTm >= ($x - $xError)) + && ($xTm <= ($x + $xError)) + && ($yTm >= ($y - $yError)) + && ($yTm <= ($y + $yError))) { + $extractedData[] = [$tm, $text]; + continue; + } + } + + return $extractedData; + } } diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 958ca107..f95134b1 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -5,66 +5,127 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser; +use Smalot\PdfParser\Element\ElementArray; + /** * Class Pages - * - * @package Smalot\PdfParser */ -class Pages extends Object +class Pages extends PDFObject { /** - * @param bool $deep + * @var array<\Smalot\PdfParser\Font>|null + */ + protected $fonts; + + /** + * @todo Objects other than Pages or Page might need to be treated specifically + * in order to get Page objects out of them. * - * @return array + * @see https://github.com/smalot/pdfparser/issues/331 */ - public function getPages($deep = false) + public function getPages(bool $deep = false): array { - if ($this->has('Kids')) { + if (!$this->has('Kids')) { + return []; + } + + /** @var ElementArray $kidsElement */ + $kidsElement = $this->get('Kids'); - if (!$deep) { - return $this->get('Kids')->getContent(); + if (!$deep) { + return $kidsElement->getContent(); + } + + // Prepare to apply the Pages' object's fonts to each page + if (false === \is_array($this->fonts)) { + $this->setupFonts(); + } + $fontsAvailable = 0 < \count($this->fonts); + + $kids = $kidsElement->getContent(); + $pages = []; + + foreach ($kids as $kid) { + if ($kid instanceof self) { + $pages = array_merge($pages, $kid->getPages(true)); + } elseif ($kid instanceof Page) { + if ($fontsAvailable) { + $kid->setFonts($this->fonts); + } + $pages[] = $kid; + } + } + + return $pages; + } + + /** + * Gathers information about fonts and collects them in a list. 
+ * + * @return void + * + * @internal + */ + protected function setupFonts() + { + $resources = $this->get('Resources'); + + if (method_exists($resources, 'has') && $resources->has('Font')) { + // no fonts available, therefore stop here + if ($resources->get('Font') instanceof Element\ElementMissing) { + return; + } + + if ($resources->get('Font') instanceof Header) { + $fonts = $resources->get('Font')->getElements(); } else { - $kids = $this->get('Kids')->getContent(); - $pages = array(); + $fonts = $resources->get('Font')->getHeader()->getElements(); + } - foreach ($kids as $kid) { + $table = []; - if ($kid instanceof Pages) { - $pages = array_merge($pages, $kid->getPages(true)); - } else { - $pages[] = $kid; + foreach ($fonts as $id => $font) { + if ($font instanceof Font) { + $table[$id] = $font; + + // Store too on cleaned id value (only numeric) + $id = preg_replace('/[^0-9\.\-_]/', '', $id); + if ('' != $id) { + $table[$id] = $font; } } - - return $pages; } - } - return array(); + $this->fonts = $table; + } else { + $this->fonts = []; + } } } diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index 5d36307d..b051f114 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -5,27 +5,29 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser; @@ -39,64 +41,82 @@ use Smalot\PdfParser\Element\ElementNumeric; use Smalot\PdfParser\Element\ElementString; use Smalot\PdfParser\Element\ElementXRef; +use Smalot\PdfParser\RawData\RawDataParser; /** * Class Parser - * - * @package Smalot\PdfParser */ class Parser { /** - * @var Object[] + * @var Config */ - protected $objects = array(); + private $config; /** - * + * @var PDFObject[] */ - public function __construct() + protected $objects = []; + + protected $rawDataParser; + + public function __construct($cfg = [], ?Config $config = null) { + $this->config = $config ?: new Config(); + $this->rawDataParser = new RawDataParser($cfg, $this->config); + } + public function getConfig(): Config + { + return $this->config; } /** - * Parse PDF file - * - * @param string $filename - * - * @return Document + * @throws \Exception */ - public function parseFile($filename) + public function parseFile(string $filename): Document { $content = file_get_contents($filename); + /* + * 2018/06/20 @doganoo as multiple times a + * users have complained that the parseFile() + * method dies silently, it is an better option + * to remove the error control operator (@) and + * let the users know that the method throws an exception + * by adding @throws tag to PHPDoc. 
+ * + * See here for an example: https://github.com/smalot/pdfparser/issues/204 + */ return $this->parseContent($content); } /** - * Parse PDF content - * - * @param string $content + * @param string $content PDF content to parse * - * @return Document + * @throws \Exception if secured PDF file was detected + * @throws \Exception if no object list was found */ - public function parseContent($content) + public function parseContent(string $content): Document { - // Create structure using TCPDF Parser. - $parser = new \TCPDF_PARSER($content); - list($xref, $data) = $parser->getParsedData(); + // Create structure from raw data. + list($xref, $data) = $this->rawDataParser->parseData($content); - if (isset($xref['trailer']['encrypt'])) { + if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) { throw new \Exception('Secured pdf file are currently not supported.'); } + if (empty($data)) { + throw new \Exception('Object list not found. Possible secured file.'); + } + // Create destination object. 
- $document = new Document(); - $this->objects = array(); + $document = new Document(); + $this->objects = []; foreach ($data as $id => $structure) { $this->parseObject($id, $structure, $document); + unset($data[$id]); } $document->setTrailer($this->parseTrailer($xref['trailer'], $document)); @@ -105,19 +125,19 @@ public function parseContent($content) return $document; } - protected function parseTrailer($structure, $document) + protected function parseTrailer(array $structure, ?Document $document) { - $trailer = array(); + $trailer = []; foreach ($structure as $name => $values) { $name = ucfirst($name); if (is_numeric($values)) { - $trailer[$name] = new ElementNumeric($values, $document); - } elseif (is_array($values)) { + $trailer[$name] = new ElementNumeric($values); + } elseif (\is_array($values)) { $value = $this->parseTrailer($values, null); $trailer[$name] = new ElementArray($value, null); - } elseif (strpos($values, '_') !== false) { + } elseif (false !== strpos($values, '_')) { $trailer[$name] = new ElementXRef($values, $document); } else { $trailer[$name] = $this->parseHeaderElement('(', $values, $document); @@ -127,24 +147,22 @@ protected function parseTrailer($structure, $document) return new Header($trailer, $document); } - /** - * @param string $id - * @param array $structure - * @param Document $document - */ - protected function parseObject($id, $structure, $document) + protected function parseObject(string $id, array $structure, ?Document $document) { - $header = new Header(array(), $document); + $header = new Header([], $document); $content = ''; foreach ($structure as $position => $part) { + if (\is_int($part)) { + $part = [null, null]; + } switch ($part[0]) { case '[': - $elements = array(); + $elements = []; foreach ($part[1] as $sub_element) { - $sub_type = $sub_element[0]; - $sub_value = $sub_element[1]; + $sub_type = $sub_element[0]; + $sub_value = $sub_element[1]; $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document); } 
@@ -159,81 +177,78 @@ protected function parseObject($id, $structure, $document) $content = isset($part[3][0]) ? $part[3][0] : $part[1]; if ($header->get('Type')->equals('ObjStm')) { - $match = array(); + $match = []; // Split xrefs and contents. - preg_match('/^([\d\s]+)(.*)$/s', $content, $match); - $content = $match[2]; + preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match); + $content = $match[3]; // Extract xrefs. $xrefs = preg_split( '/(\d+\s+\d+\s*)/s', $match[1], -1, - PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE + \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE ); - $table = array(); + $table = []; foreach ($xrefs as $xref) { - list($id, $position) = explode(' ', trim($xref)); + list($id, $position) = preg_split("/\s+/", trim($xref)); $table[$position] = $id; } ksort($table); - $ids = array_values($table); + $ids = array_values($table); $positions = array_keys($table); foreach ($positions as $index => $position) { - $id = $ids[$index] . '_0'; - $next_position = isset($positions[$index + 1]) ? $positions[$index + 1] : strlen($content); - $sub_content = substr($content, $position, $next_position - $position); + $id = $ids[$index].'_0'; + $next_position = isset($positions[$index + 1]) ? $positions[$index + 1] : \strlen($content); + $sub_content = substr($content, $position, (int) $next_position - (int) $position); - $sub_header = Header::parse($sub_content, $document); - $object = Object::factory($document, $sub_header, ''); + $sub_header = Header::parse($sub_content, $document); + $object = PDFObject::factory($document, $sub_header, '', $this->config); $this->objects[$id] = $object; } // It is not necessary to store this content. 
- $content = ''; return; + } elseif ($header->get('Type')->equals('Metadata')) { + // Attempt to parse XMP XML Metadata + $document->extractXMPMetadata($content); } break; default: - if ($part != 'null') { + if ('null' != $part) { $element = $this->parseHeaderElement($part[0], $part[1], $document); if ($element) { - $header = new Header(array($element), $document); + $header = new Header([$element], $document); } } break; - } } if (!isset($this->objects[$id])) { - $this->objects[$id] = Object::factory($document, $header, $content); + $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config); } } /** - * @param array $structure - * @param Document $document - * - * @return Header * @throws \Exception */ - protected function parseHeader($structure, $document) + protected function parseHeader(array $structure, ?Document $document): Header { - $elements = array(); - $count = count($structure); + $elements = []; + $count = \count($structure); for ($position = 0; $position < $count; $position += 2) { - $name = $structure[$position][1]; - $type = $structure[$position + 1][0]; + $name = $structure[$position][1]; + $type = $structure[$position + 1][0]; $value = $structure[$position + 1][1]; $elements[$name] = $this->parseHeaderElement($type, $value, $document); @@ -243,63 +258,74 @@ protected function parseHeader($structure, $document) } /** - * @param $type - * @param $value - * @param $document + * @param string|array $value + * + * @return Element|Header|null * - * @return Element|Header * @throws \Exception */ - protected function parseHeaderElement($type, $value, $document) + protected function parseHeaderElement(?string $type, $value, ?Document $document) { + $valueIsEmpty = null == $value || '' == $value || false == $value; + if (('<<' === $type || '>>' === $type) && $valueIsEmpty) { + $value = []; + } + switch ($type) { case '<<': - return $this->parseHeader($value, $document); + case '>>': + $header = $this->parseHeader($value, 
$document); + PDFObject::factory($document, $header, null, $this->config); + + return $header; case 'numeric': - return new ElementNumeric($value, $document); + return new ElementNumeric($value); case 'boolean': - return new ElementBoolean($value, $document); + return new ElementBoolean($value); case 'null': - return new ElementNull($value, $document); + return new ElementNull(); case '(': - if ($date = ElementDate::parse('(' . $value . ')', $document)) { + if ($date = ElementDate::parse('('.$value.')', $document)) { return $date; - } else { - return ElementString::parse('(' . $value . ')', $document); } + return ElementString::parse('('.$value.')', $document); + case '<': - return $this->parseHeaderElement('(', ElementHexa::decode($value, $document), $document); + return $this->parseHeaderElement('(', ElementHexa::decode($value), $document); case '/': - return ElementName::parse('/' . $value, $document); + return ElementName::parse('/'.$value, $document); case 'ojbref': // old mistake in tcpdf parser case 'objref': return new ElementXRef($value, $document); case '[': - $values = array(); + $values = []; - foreach ($value as $sub_element) { - $sub_type = $sub_element[0]; - $sub_value = $sub_element[1]; - $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document); + if (\is_array($value)) { + foreach ($value as $sub_element) { + $sub_type = $sub_element[0]; + $sub_value = $sub_element[1]; + $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document); + } } return new ElementArray($values, $document); case 'endstream': + case 'obj': // I don't know what it means but got my project fixed. case '': // Nothing to do with. - break; + return null; default: - throw new \Exception('Invalid type: "' . $type . 
'".'); + throw new \Exception('Invalid type: "'.$type.'".'); } } } diff --git a/src/Smalot/PdfParser/RawData/FilterHelper.php b/src/Smalot/PdfParser/RawData/FilterHelper.php new file mode 100644 index 00000000..617936d9 --- /dev/null +++ b/src/Smalot/PdfParser/RawData/FilterHelper.php @@ -0,0 +1,427 @@ + + * + * @date 2020-01-06 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Smalot\PdfParser\RawData; + +use Smalot\PdfParser\Exception\NotImplementedException; + +class FilterHelper +{ + protected $availableFilters = ['ASCIIHexDecode', 'ASCII85Decode', 'LZWDecode', 'FlateDecode', 'RunLengthDecode']; + + /** + * Decode data using the specified filter type. 
+ * + * @param string $filter Filter name + * @param string $data Data to decode + * + * @return string Decoded data string + * + * @throws \Exception + * @throws \Smalot\PdfParser\Exception\NotImplementedException if a certain decode function is not implemented yet + */ + public function decodeFilter(string $filter, string $data, int $decodeMemoryLimit = 0): string + { + switch ($filter) { + case 'ASCIIHexDecode': + return $this->decodeFilterASCIIHexDecode($data); + + case 'ASCII85Decode': + return $this->decodeFilterASCII85Decode($data); + + case 'LZWDecode': + return $this->decodeFilterLZWDecode($data); + + case 'FlateDecode': + return $this->decodeFilterFlateDecode($data, $decodeMemoryLimit); + + case 'RunLengthDecode': + return $this->decodeFilterRunLengthDecode($data); + + case 'CCITTFaxDecode': + throw new NotImplementedException('Decode CCITTFaxDecode not implemented yet.'); + case 'JBIG2Decode': + throw new NotImplementedException('Decode JBIG2Decode not implemented yet.'); + case 'DCTDecode': + throw new NotImplementedException('Decode DCTDecode not implemented yet.'); + case 'JPXDecode': + throw new NotImplementedException('Decode JPXDecode not implemented yet.'); + case 'Crypt': + throw new NotImplementedException('Decode Crypt not implemented yet.'); + default: + return $data; + } + } + + /** + * ASCIIHexDecode + * + * Decodes data encoded in an ASCII hexadecimal representation, reproducing the original binary data. 
+ * + * @param string $data Data to decode + * + * @return string data string + * + * @throws \Exception + */ + protected function decodeFilterASCIIHexDecode(string $data): string + { + // all white-space characters shall be ignored + $data = preg_replace('/[\s]/', '', $data); + // check for EOD character: GREATER-THAN SIGN (3Eh) + $eod = strpos($data, '>'); + if (false !== $eod) { + // remove EOD and extra data (if any) + $data = substr($data, 0, $eod); + $eod = true; + } + // get data length + $data_length = \strlen($data); + if (0 != ($data_length % 2)) { + // odd number of hexadecimal digits + if ($eod) { + // EOD shall behave as if a 0 (zero) followed the last digit + $data = substr($data, 0, -1).'0'.substr($data, -1); + } else { + throw new \Exception('decodeFilterASCIIHexDecode: invalid code'); + } + } + // check for invalid characters + if (preg_match('/[^a-fA-F\d]/', $data) > 0) { + throw new \Exception('decodeFilterASCIIHexDecode: invalid code'); + } + // get one byte of binary data for each pair of ASCII hexadecimal digits + $decoded = pack('H*', $data); + + return $decoded; + } + + /** + * ASCII85Decode + * + * Decodes data encoded in an ASCII base-85 representation, reproducing the original binary data. 
+ * + * @param string $data Data to decode + * + * @return string data string + * + * @throws \Exception + */ + protected function decodeFilterASCII85Decode(string $data): string + { + // initialize string to return + $decoded = ''; + // all white-space characters shall be ignored + $data = preg_replace('/[\s]/', '', $data); + // remove start sequence 2-character sequence <~ (3Ch)(7Eh) + if (0 === strpos($data, '<~')) { + // remove EOD and extra data (if any) + $data = substr($data, 2); + } + // check for EOD: 2-character sequence ~> (7Eh)(3Eh) + $eod = strpos($data, '~>'); + if (\strlen($data) - 2 === $eod) { + // remove EOD and extra data (if any) + $data = substr($data, 0, $eod); + } + // data length + $data_length = \strlen($data); + // check for invalid characters + if (preg_match('/[^\x21-\x75,\x74]/', $data) > 0) { + throw new \Exception('decodeFilterASCII85Decode: invalid code'); + } + // z sequence + $zseq = \chr(0).\chr(0).\chr(0).\chr(0); + // position inside a group of 4 bytes (0-3) + $group_pos = 0; + $tuple = 0; + $pow85 = [85 * 85 * 85 * 85, 85 * 85 * 85, 85 * 85, 85, 1]; + + // for each byte + for ($i = 0; $i < $data_length; ++$i) { + // get char value + $char = \ord($data[$i]); + if (122 == $char) { // 'z' + if (0 == $group_pos) { + $decoded .= $zseq; + } else { + throw new \Exception('decodeFilterASCII85Decode: invalid code'); + } + } else { + // the value represented by a group of 5 characters should never be greater than 2^32 - 1 + $tuple += (($char - 33) * $pow85[$group_pos]); + if (4 == $group_pos) { + // The following if-clauses are an attempt to fix/suppress the following deprecation warning: + // chr(): Providing a value not in-between 0 and 255 is deprecated, this is because a byte value + // must be in the [0, 255] interval. The value used will be constrained using % 256 + // I know this is ugly and there might be more fancier ways. If you know one, feel free to provide a pull request. 
+ if (255 < $tuple >> 8) { + $chr8Part = \chr(($tuple >> 8) % 256); + } else { + $chr8Part = \chr($tuple >> 8); + } + + if (255 < $tuple >> 16) { + $chr16Part = \chr(($tuple >> 16) % 256); + } else { + $chr16Part = \chr($tuple >> 16); + } + + if (255 < $tuple >> 24) { + $chr24Part = \chr(($tuple >> 24) % 256); + } else { + $chr24Part = \chr($tuple >> 24); + } + + if (255 < $tuple) { + $chrTuple = \chr($tuple % 256); + } else { + $chrTuple = \chr($tuple); + } + + $decoded .= $chr24Part . $chr16Part . $chr8Part . $chrTuple; + $tuple = 0; + $group_pos = 0; + } else { + ++$group_pos; + } + } + } + if ($group_pos > 1) { + $tuple += $pow85[$group_pos - 1]; + } + // last tuple (if any) + switch ($group_pos) { + case 4: + $decoded .= \chr($tuple >> 24).\chr($tuple >> 16).\chr($tuple >> 8); + break; + + case 3: + $decoded .= \chr($tuple >> 24).\chr($tuple >> 16); + break; + + case 2: + $decoded .= \chr($tuple >> 24); + break; + + case 1: + throw new \Exception('decodeFilterASCII85Decode: invalid code'); + } + + return $decoded; + } + + /** + * FlateDecode + * + * Decompresses data encoded using the zlib/deflate compression method, reproducing the original text or binary data. + * + * @param string $data Data to decode + * @param int $decodeMemoryLimit Memory limit on deflation + * + * @return string data string + * + * @throws \Exception + */ + protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string + { + // Uncatchable E_WARNING for "data error" is @ suppressed + // so execution may proceed with an alternate decompression + // method. + $decoded = @gzuncompress($data, $decodeMemoryLimit); + + if (false === $decoded) { + // If gzuncompress() failed, try again using the compress.zlib:// + // wrapper to decode it in a file-based context. 
+ // See: https://www.php.net/manual/en/function.gzuncompress.php#79042 + // Issue: https://github.com/smalot/pdfparser/issues/592 + $ztmp = tmpfile(); + if (false != $ztmp) { + fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data); + $file = stream_get_meta_data($ztmp)['uri']; + if (0 === $decodeMemoryLimit) { + $decoded = file_get_contents('compress.zlib://'.$file); + } else { + $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit); + } + fclose($ztmp); + } + } + + if (false === \is_string($decoded) || '' === $decoded) { + // If the decoded string is empty, that means decoding failed. + throw new \Exception('decodeFilterFlateDecode: invalid data'); + } + + return $decoded; + } + + /** + * LZWDecode + * + * Decompresses data encoded using the LZW (Lempel-Ziv-Welch) adaptive compression method, reproducing the original text or binary data. + * + * @param string $data Data to decode + * + * @return string Data string + */ + protected function decodeFilterLZWDecode(string $data): string + { + // initialize string to return + $decoded = ''; + // data length + $data_length = \strlen($data); + // convert string to binary string + $bitstring = ''; + for ($i = 0; $i < $data_length; ++$i) { + $bitstring .= \sprintf('%08b', \ord($data[$i])); + } + // get the number of bits + $data_length = \strlen($bitstring); + // initialize code length in bits + $bitlen = 9; + // initialize dictionary index + $dix = 258; + // initialize the dictionary (with the first 256 entries). 
+ $dictionary = []; + for ($i = 0; $i < 256; ++$i) { + $dictionary[$i] = \chr($i); + } + // previous val + $prev_index = 0; + // while we encounter EOD marker (257), read code_length bits + while (($data_length > 0) && (257 != ($index = bindec(substr($bitstring, 0, $bitlen))))) { + // remove read bits from string + $bitstring = substr($bitstring, $bitlen); + // update number of bits + $data_length -= $bitlen; + if (256 == $index) { // clear-table marker + // reset code length in bits + $bitlen = 9; + // reset dictionary index + $dix = 258; + $prev_index = 256; + // reset the dictionary (with the first 256 entries). + $dictionary = []; + for ($i = 0; $i < 256; ++$i) { + $dictionary[$i] = \chr($i); + } + } elseif (256 == $prev_index) { + // first entry + $decoded .= $dictionary[$index]; + $prev_index = $index; + } else { + // check if index exist in the dictionary + if ($index < $dix) { + // index exist on dictionary + $decoded .= $dictionary[$index]; + $dic_val = $dictionary[$prev_index].$dictionary[$index][0]; + // store current index + $prev_index = $index; + } else { + // index do not exist on dictionary + $dic_val = $dictionary[$prev_index].$dictionary[$prev_index][0]; + $decoded .= $dic_val; + } + // update dictionary + $dictionary[$dix] = $dic_val; + ++$dix; + // change bit length by case + if (2047 == $dix) { + $bitlen = 12; + } elseif (1023 == $dix) { + $bitlen = 11; + } elseif (511 == $dix) { + $bitlen = 10; + } + } + } + + return $decoded; + } + + /** + * RunLengthDecode + * + * Decompresses data encoded using a byte-oriented run-length encoding algorithm. 
+ * + * @param string $data Data to decode + */ + protected function decodeFilterRunLengthDecode(string $data): string + { + // initialize string to return + $decoded = ''; + // data length + $data_length = \strlen($data); + $i = 0; + while ($i < $data_length) { + // get current byte value + $byte = \ord($data[$i]); + if (128 == $byte) { + // a length value of 128 denote EOD + break; + } elseif ($byte < 128) { + // if the length byte is in the range 0 to 127 + // the following length + 1 (1 to 128) bytes shall be copied literally during decompression + $decoded .= substr($data, $i + 1, $byte + 1); + // move to next block + $i += ($byte + 2); + } else { + // if length is in the range 129 to 255, + // the following single byte shall be copied 257 - length (2 to 128) times during decompression + $decoded .= str_repeat($data[$i + 1], 257 - $byte); + // move to next block + $i += 2; + } + } + + return $decoded; + } + + /** + * @return array list of available filters + */ + public function getAvailableFilters(): array + { + return $this->availableFilters; + } +} diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php new file mode 100644 index 00000000..ec8d01e5 --- /dev/null +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -0,0 +1,990 @@ + + * + * @date 2020-01-06 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Smalot\PdfParser\RawData; + +use Smalot\PdfParser\Config; +use Smalot\PdfParser\Exception\EmptyPdfException; +use Smalot\PdfParser\Exception\MissingPdfHeaderException; + +class RawDataParser +{ + /** + * @var Config + */ + private $config; + + /** + * Configuration array. + * + * @var array + */ + protected $cfg = [ + // if `true` ignore filter decoding errors + 'ignore_filter_decoding_errors' => true, + // if `true` ignore missing filter decoding errors + 'ignore_missing_filter_decoders' => true, + ]; + + protected $filterHelper; + protected $objects; + + /** + * @param array $cfg Configuration array, default is [] + */ + public function __construct($cfg = [], ?Config $config = null) + { + // merge given array with default values + $this->cfg = array_merge($this->cfg, $cfg); + + $this->filterHelper = new FilterHelper(); + $this->config = $config ?: new Config(); + } + + /** + * Decode the specified stream. 
+ * + * @param string $pdfData PDF data + * @param array $sdic Stream's dictionary array + * @param string $stream Stream to decode + * + * @return array containing decoded stream data and remaining filters + * + * @throws \Exception + */ + protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array + { + // get stream length and filters + $slength = \strlen($stream); + if ($slength <= 0) { + return ['', []]; + } + $filters = []; + foreach ($sdic as $k => $v) { + if ('/' == $v[0]) { + if (('Length' == $v[1]) && (isset($sdic[$k + 1])) && ('numeric' == $sdic[$k + 1][0])) { + // get declared stream length + $declength = (int) $sdic[$k + 1][1]; + if ($declength < $slength) { + $stream = substr($stream, 0, $declength); + $slength = $declength; + } + } elseif (('Filter' == $v[1]) && (isset($sdic[$k + 1]))) { + // resolve indirect object + $objval = $this->getObjectVal($pdfData, $xref, $sdic[$k + 1]); + if ('/' == $objval[0]) { + // single filter + $filters[] = $objval[1]; + } elseif ('[' == $objval[0]) { + // array of filters + foreach ($objval[1] as $flt) { + if ('/' == $flt[0]) { + $filters[] = $flt[1]; + } + } + } + } + } + } + + // decode the stream + $remaining_filters = []; + foreach ($filters as $filter) { + if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) { + try { + $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit()); + } catch (\Exception $e) { + $emsg = $e->getMessage(); + if ((('~' == $emsg[0]) && !$this->cfg['ignore_missing_filter_decoders']) + || (('~' != $emsg[0]) && !$this->cfg['ignore_filter_decoding_errors']) + ) { + throw new \Exception($e->getMessage()); + } + } + } else { + // add missing filter to array + $remaining_filters[] = $filter; + } + } + + return [$stream, $remaining_filters]; + } + + /** + * Decode the Cross-Reference section + * + * @param string $pdfData PDF data + * @param int $startxref Offset at which the xref section starts 
(position of the 'xref' keyword) + * @param array $xref Previous xref array (if any) + * @param array $visitedOffsets Array of visited offsets to prevent infinite loops + * + * @return array containing xref and trailer data + * + * @throws \Exception + */ + protected function decodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array + { + $startxref += 4; // 4 is the length of the word 'xref' + // skip initial white space chars + $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref); + // initialize object number + $obj_num = 0; + // search for cross-reference entries or subsection + while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) { + if ($matches[0][1] != $offset) { + // we are on another section + break; + } + $offset += \strlen($matches[0][0]); + if ('n' == $matches[3][0]) { + // create unique object index: [object number]_[generation number] + $index = $obj_num.'_'.(int) $matches[2][0]; + // check if object already exist + if (!isset($xref['xref'][$index])) { + // store object offset position + $xref['xref'][$index] = (int) $matches[1][0]; + } + ++$obj_num; + } elseif ('f' == $matches[3][0]) { + ++$obj_num; + } else { + // object number (index) + $obj_num = (int) $matches[1][0]; + } + } + // get trailer data + if (preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) { + $trailer_data = $matches[1][0]; + if (!isset($xref['trailer']) || empty($xref['trailer'])) { + // get only the last updated version + $xref['trailer'] = []; + // parse trailer_data + if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + $xref['trailer']['size'] = (int) $matches[1]; + } + if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + $xref['trailer']['root'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if 
(preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + $xref['trailer']['encrypt'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + $xref['trailer']['info'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) { + $xref['trailer']['id'] = []; + $xref['trailer']['id'][0] = $matches[1]; + $xref['trailer']['id'][1] = $matches[2]; + } + } + if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + $offset = (int) $matches[1]; + if (0 != $offset) { + // get previous xref + $xref = $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets); + } + } + } else { + throw new \Exception('Unable to find trailer'); + } + + return $xref; + } + + /** + * Decode the Cross-Reference Stream section + * + * @param string $pdfData PDF data + * @param int $startxref Offset at which the xref section starts + * @param array $xref Previous xref array (if any) + * @param array $visitedOffsets Array of visited offsets to prevent infinite loops + * + * @return array containing xref and trailer data + * + * @throws \Exception if unknown PNG predictor detected + */ + protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array + { + // try to read Cross-Reference Stream + $xrefobj = $this->getRawObject($pdfData, $startxref); + $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true); + if (!isset($xref['trailer']) || empty($xref['trailer'])) { + // get only the last updated version + $xref['trailer'] = []; + $filltrailer = true; + } else { + $filltrailer = false; + } + if (!isset($xref['xref'])) { + $xref['xref'] = []; + } + $valid_crs = false; + $columns = 0; + $predictor = null; + $sarr = $xrefcrs[0][1]; + if (!\is_array($sarr)) { + $sarr = []; + } + + $wb = []; + + 
foreach ($sarr as $k => $v) { + if ( + ('/' == $v[0]) + && ('Type' == $v[1]) + && ( + isset($sarr[$k + 1]) + && '/' == $sarr[$k + 1][0] + && 'XRef' == $sarr[$k + 1][1] + ) + ) { + $valid_crs = true; + } elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[$k + 1]))) { + // initialize list for: first object number in the subsection / number of objects + $index_blocks = []; + for ($m = 0; $m < \count($sarr[$k + 1][1]); $m += 2) { + $index_blocks[] = [$sarr[$k + 1][1][$m][1], $sarr[$k + 1][1][$m + 1][1]]; + } + } elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[$k + 1]) && ('numeric' == $sarr[$k + 1][0]))) { + // get previous xref offset + $prevxref = (int) $sarr[$k + 1][1]; + } elseif (('/' == $v[0]) && ('W' == $v[1]) && (isset($sarr[$k + 1]))) { + // number of bytes (in the decoded stream) of the corresponding field + $wb[0] = (int) $sarr[$k + 1][1][0][1]; + $wb[1] = (int) $sarr[$k + 1][1][1][1]; + $wb[2] = (int) $sarr[$k + 1][1][2][1]; + } elseif (('/' == $v[0]) && ('DecodeParms' == $v[1]) && (isset($sarr[$k + 1][1]))) { + $decpar = $sarr[$k + 1][1]; + foreach ($decpar as $kdc => $vdc) { + if ( + '/' == $vdc[0] + && 'Columns' == $vdc[1] + && ( + isset($decpar[$kdc + 1]) + && 'numeric' == $decpar[$kdc + 1][0] + ) + ) { + $columns = (int) $decpar[$kdc + 1][1]; + } elseif ( + '/' == $vdc[0] + && 'Predictor' == $vdc[1] + && ( + isset($decpar[$kdc + 1]) + && 'numeric' == $decpar[$kdc + 1][0] + ) + ) { + $predictor = (int) $decpar[$kdc + 1][1]; + } + } + } elseif ($filltrailer) { + if (('/' == $v[0]) && ('Size' == $v[1]) && (isset($sarr[$k + 1]) && ('numeric' == $sarr[$k + 1][0]))) { + $xref['trailer']['size'] = $sarr[$k + 1][1]; + } elseif (('/' == $v[0]) && ('Root' == $v[1]) && (isset($sarr[$k + 1]) && ('objref' == $sarr[$k + 1][0]))) { + $xref['trailer']['root'] = $sarr[$k + 1][1]; + } elseif (('/' == $v[0]) && ('Info' == $v[1]) && (isset($sarr[$k + 1]) && ('objref' == $sarr[$k + 1][0]))) { + $xref['trailer']['info'] = $sarr[$k + 1][1]; + } elseif 
(('/' == $v[0]) && ('Encrypt' == $v[1]) && (isset($sarr[$k + 1]) && ('objref' == $sarr[$k + 1][0]))) { + $xref['trailer']['encrypt'] = $sarr[$k + 1][1]; + } elseif (('/' == $v[0]) && ('ID' == $v[1]) && (isset($sarr[$k + 1]))) { + $xref['trailer']['id'] = []; + $xref['trailer']['id'][0] = $sarr[$k + 1][1][0][1]; + $xref['trailer']['id'][1] = $sarr[$k + 1][1][1][1]; + } + } + } + + // decode data + if ($valid_crs && isset($xrefcrs[1][3][0])) { + if (null !== $predictor) { + // number of bytes in a row + $rowlen = ($columns + 1); + // convert the stream into an array of integers + /** @var array */ + $sdata = unpack('C*', $xrefcrs[1][3][0]); + // TODO: Handle the case when unpack returns false + + // split the rows + $sdata = array_chunk($sdata, $rowlen); + + // initialize decoded array + $ddata = []; + // initialize first row with zeros + $prev_row = array_fill(0, $rowlen, 0); + // for each row apply PNG unpredictor + foreach ($sdata as $k => $row) { + // initialize new row + $ddata[$k] = []; + // get PNG predictor value + $predictor = (10 + $row[0]); + // for each byte on the row + for ($i = 1; $i <= $columns; ++$i) { + // new index + $j = ($i - 1); + $row_up = $prev_row[$j]; + if (1 == $i) { + $row_left = 0; + $row_upleft = 0; + } else { + $row_left = $row[$i - 1]; + $row_upleft = $prev_row[$j - 1]; + } + switch ($predictor) { + case 10: // PNG prediction (on encoding, PNG None on all rows) + $ddata[$k][$j] = $row[$i]; + break; + + case 11: // PNG prediction (on encoding, PNG Sub on all rows) + $ddata[$k][$j] = (($row[$i] + $row_left) & 0xFF); + break; + + case 12: // PNG prediction (on encoding, PNG Up on all rows) + $ddata[$k][$j] = (($row[$i] + $row_up) & 0xFF); + break; + + case 13: // PNG prediction (on encoding, PNG Average on all rows) + $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) / 2)) & 0xFF); + break; + + case 14: // PNG prediction (on encoding, PNG Paeth on all rows) + // initial estimate + $p = ($row_left + $row_up - $row_upleft); + // 
distances + $pa = abs($p - $row_left); + $pb = abs($p - $row_up); + $pc = abs($p - $row_upleft); + $pmin = min($pa, $pb, $pc); + // return minimum distance + switch ($pmin) { + case $pa: + $ddata[$k][$j] = (($row[$i] + $row_left) & 0xFF); + break; + + case $pb: + $ddata[$k][$j] = (($row[$i] + $row_up) & 0xFF); + break; + + case $pc: + $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xFF); + break; + } + break; + + default: // PNG prediction (on encoding, PNG optimum) + throw new \Exception('Unknown PNG predictor: '.$predictor); + } + } + $prev_row = $ddata[$k]; + } // end for each row + // complete decoding + } else { + // number of bytes in a row + $rowlen = array_sum($wb); + if (0 < $rowlen) { + // convert the stream into an array of integers + $sdata = unpack('C*', $xrefcrs[1][3][0]); + // split the rows + $ddata = array_chunk($sdata, $rowlen); + } else { + // if the row length is zero, $ddata should be an empty array as well + $ddata = []; + } + } + + $sdata = []; + + // for every row + foreach ($ddata as $k => $row) { + // initialize new row + $sdata[$k] = [0, 0, 0]; + if (0 == $wb[0]) { + // default type field + $sdata[$k][0] = 1; + } + $i = 0; // count bytes in the row + // for every column + for ($c = 0; $c < 3; ++$c) { + // for every byte on the column + for ($b = 0; $b < $wb[$c]; ++$b) { + if (isset($row[$i])) { + $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8)); + } + ++$i; + } + } + } + + // fill xref + if (isset($index_blocks)) { + // load the first object number of the first /Index entry + $obj_num = $index_blocks[0][0]; + } else { + $obj_num = 0; + } + foreach ($sdata as $k => $row) { + switch ($row[0]) { + case 0: // (f) linked list of free objects + break; + + case 1: // (n) objects that are in use but are not compressed + // create unique object index: [object number]_[generation number] + $index = $obj_num.'_'.$row[2]; + // check if object already exist + if (!isset($xref['xref'][$index])) { + // store object offset position + 
$xref['xref'][$index] = $row[1]; + } + break; + + case 2: // compressed objects + // $row[1] = object number of the object stream in which this object is stored + // $row[2] = index of this object within the object stream + $index = $row[1].'_0_'.$row[2]; + $xref['xref'][$index] = -1; + break; + + default: // null objects + break; + } + ++$obj_num; + if (isset($index_blocks)) { + // reduce the number of remaining objects + --$index_blocks[0][1]; + if (0 == $index_blocks[0][1]) { + // remove the actual used /Index entry + array_shift($index_blocks); + if (0 < \count($index_blocks)) { + // load the first object number of the following /Index entry + $obj_num = $index_blocks[0][0]; + } else { + // if there are no more entries, remove $index_blocks to avoid actions on an empty array + unset($index_blocks); + } + } + } + } + } // end decoding data + if (isset($prevxref)) { + // get previous xref + $xref = $this->getXrefData($pdfData, $prevxref, $xref, $visitedOffsets); + } + + return $xref; + } + + protected function getObjectHeaderPattern(array $objRefs): string + { + // consider all whitespace character (PDF specifications) + return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().$objRefs[1].$this->config->getPdfWhitespacesRegex().'obj/'; + } + + protected function getObjectHeaderLen(array $objRefs): int + { + // "4 0 obj" + // 2 whitespaces + strlen("obj") = 5 + return 5 + \strlen($objRefs[0]) + \strlen($objRefs[1]); + } + + /** + * Get content of indirect object. 
+ * + * @param string $pdfData PDF data + * @param string $objRef Object number and generation number separated by underscore character + * @param int $offset Object offset + * @param bool $decoding If true decode streams + * + * @return array containing object data + * + * @throws \Exception if invalid object reference found + */ + protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array + { + /* + * build indirect object header + */ + // $objHeader = "[object number] [generation number] obj" + $objRefArr = explode('_', $objRef); + if (2 !== \count($objRefArr)) { + throw new \Exception('Invalid object reference for $obj.'); + } + + $objHeaderLen = $this->getObjectHeaderLen($objRefArr); + + /* + * check if we are in position + */ + // ignore whitespace characters at offset + $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); + // ignore leading zeros for object number + $offset += strspn($pdfData, '0', $offset); + if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) { + // an indirect reference to an undefined object shall be considered a reference to the null object + return ['null', 'null', $offset]; + } + + /* + * get content + */ + // starting position of object content + $offset += $objHeaderLen; + $objContentArr = []; + $i = 0; // object main index + $header = null; + do { + $oldOffset = $offset; + // get element + $element = $this->getRawObject($pdfData, $offset, null != $header ? $header[1] : null); + $offset = $element[2]; + // decode stream using stream's dictionary information + if ($decoding && ('stream' === $element[0]) && null != $header) { + $element[3] = $this->decodeStream($pdfData, $xref, $header[1], $element[1]); + } + $objContentArr[$i] = $element; + $header = isset($element[0]) && '<<' === $element[0] ? 
$element : null; + ++$i; + } while (('endobj' !== $element[0]) && ($offset !== $oldOffset)); + // remove closing delimiter + array_pop($objContentArr); + + /* + * return raw object content + */ + return $objContentArr; + } + + /** + * Get the content of object, resolving indirect object reference if necessary. + * + * @param string $pdfData PDF data + * @param array $obj Object value + * + * @return array containing object data + * + * @throws \Exception + */ + protected function getObjectVal(string $pdfData, $xref, array $obj): array + { + if ('objref' == $obj[0]) { + // reference to indirect object + if (isset($this->objects[$obj[1]])) { + // this object has been already parsed + return $this->objects[$obj[1]]; + } elseif (isset($xref[$obj[1]])) { + // parse new object + $this->objects[$obj[1]] = $this->getIndirectObject($pdfData, $xref, $obj[1], $xref[$obj[1]], false); + + return $this->objects[$obj[1]]; + } + } + + return $obj; + } + + /** + * Get object type, raw value and offset to next object + * + * @param int $offset Object offset + * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. 
Used for stream parsing optimization + * + * @return array containing object type, raw value and offset to next object + */ + protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array + { + $objtype = ''; // object type to be returned + $objval = ''; // object value to be returned + + // skip initial white space chars + $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); + + // get first char + $char = $pdfData[$offset]; + // get object type + switch ($char) { + case '%': // \x25 PERCENT SIGN + // skip comment and search for next token + $next = strcspn($pdfData, "\r\n", $offset); + if ($next > 0) { + $offset += $next; + + return $this->getRawObject($pdfData, $offset); + } + break; + + case '/': // \x2F SOLIDUS + // name object + $objtype = $char; + ++$offset; + $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256); + if ($span > 0) { + $objval = substr($pdfData, $offset, $span); // unescaped value + $offset += $span; + } + break; + + case '(': // \x28 LEFT PARENTHESIS + case ')': // \x29 RIGHT PARENTHESIS + // literal string object + $objtype = $char; + ++$offset; + $strpos = $offset; + if ('(' == $char) { + $open_bracket = 1; + while ($open_bracket > 0) { + if (!isset($pdfData[$strpos])) { + break; + } + $ch = $pdfData[$strpos]; + switch ($ch) { + case '\\': // REVERSE SOLIDUS (5Ch) (Backslash) + // skip next character + ++$strpos; + break; + + case '(': // LEFT PARENHESIS (28h) + ++$open_bracket; + break; + + case ')': // RIGHT PARENTHESIS (29h) + --$open_bracket; + break; + } + ++$strpos; + } + $objval = substr($pdfData, $offset, $strpos - $offset - 1); + $offset = $strpos; + } + break; + + case '[': // \x5B LEFT SQUARE BRACKET + case ']': // \x5D RIGHT SQUARE BRACKET + // array object + $objtype = $char; + ++$offset; + if ('[' == $char) { + // get array content + $objval = []; + do { + $oldOffset = $offset; + // get element + $element 
= $this->getRawObject($pdfData, $offset); + $offset = $element[2]; + $objval[] = $element; + } while ((']' != $element[0]) && ($offset != $oldOffset)); + // remove closing delimiter + array_pop($objval); + } + break; + + case '<': // \x3C LESS-THAN SIGN + case '>': // \x3E GREATER-THAN SIGN + if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] == $char)) { + // dictionary object + $objtype = $char.$char; + $offset += 2; + if ('<' == $char) { + // get array content + $objval = []; + do { + $oldOffset = $offset; + // get element + $element = $this->getRawObject($pdfData, $offset); + $offset = $element[2]; + $objval[] = $element; + } while (('>>' != $element[0]) && ($offset != $oldOffset)); + // remove closing delimiter + array_pop($objval); + } + } else { + // hexadecimal string object + $objtype = $char; + ++$offset; + + $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset); + $dataToCheck = $pdfData[$offset + $span] ?? null; + if ('<' == $char && $span > 0 && '>' == $dataToCheck) { + // remove white space characters + $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), ''); + $offset += $span + 1; + } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) { + $offset = $endpos + 1; + } + } + break; + + default: + if ('endobj' == substr($pdfData, $offset, 6)) { + // indirect object + $objtype = 'endobj'; + $offset += 6; + } elseif ('null' == substr($pdfData, $offset, 4)) { + // null object + $objtype = 'null'; + $offset += 4; + $objval = 'null'; + } elseif ('true' == substr($pdfData, $offset, 4)) { + // boolean true object + $objtype = 'boolean'; + $offset += 4; + $objval = 'true'; + } elseif ('false' == substr($pdfData, $offset, 5)) { + // boolean false object + $objtype = 'boolean'; + $offset += 5; + $objval = 'false'; + } elseif ('stream' == substr($pdfData, $offset, 6)) { + // start stream object + $objtype = 'stream'; + $offset += 6; + if (1 == preg_match('/^( *[\r]?[\n])/isU', 
substr($pdfData, $offset, 4), $matches)) { + $offset += \strlen($matches[0]); + + // we get stream length here to later help preg_match test less data + $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0); + $skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/'); + + $pregResult = preg_match( + '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU', + $pdfData, + $matches, + \PREG_OFFSET_CAPTURE, + $offset + $streamLen + ); + + if (1 == $pregResult) { + $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset); + $offset = $matches[1][1]; + } + } + } elseif ('endstream' == substr($pdfData, $offset, 9)) { + // end stream object + $objtype = 'endstream'; + $offset += 9; + } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) { + // indirect object reference + $objtype = 'objref'; + $offset += \strlen($matches[0]); + $objval = (int) $matches[1].'_'.(int) $matches[2]; + } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) { + // object start + $objtype = 'obj'; + $objval = (int) $matches[1].'_'.(int) $matches[2]; + $offset += \strlen($matches[0]); + } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) { + // numeric object + $objtype = 'numeric'; + $objval = substr($pdfData, $offset, $numlen); + $offset += $numlen; + } + break; + } + + return [$objtype, $objval, $offset]; + } + + /** + * Get value of an object header's section (obj << YYY >> part ). + * + * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process, + * when no Smalot\PdfParser\Header objects are created yet. + * + * @param string $key header's section name + * @param string $type type of the section (i.e. 'numeric', '/', '<<', etc.) 
+ * @param string|array|null $default default value for header's section + * + * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param + */ + private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '') + { + if (false === \is_array($headerDic)) { + return $default; + } + + /* + * It recieves dictionary of header fields, as it is returned by RawDataParser::getRawObject, + * iterates over it, searching for section of type '/' whith requested key. + * If such a section is found, it tries to receive it's value (next object in dictionary), + * returning it, if it matches requested type, or default value otherwise. + */ + foreach ($headerDic as $i => $val) { + $isSectionName = \is_array($val) && 3 == \count($val) && '/' == $val[0]; + if ( + $isSectionName + && $val[1] == $key + && isset($headerDic[$i + 1]) + ) { + $isSectionValue = \is_array($headerDic[$i + 1]) && 1 < \count($headerDic[$i + 1]); + + return $isSectionValue && $type == $headerDic[$i + 1][0] + ? $headerDic[$i + 1][1] + : $default; + } + } + + return $default; + } + + /** + * Get Cross-Reference (xref) table and trailer data from PDF document data. 
+ * + * @param int $offset xref offset (if known) + * @param array $xref previous xref array (if any) + * @param array $visitedOffsets array of visited offsets to prevent infinite loops + * + * @return array containing xref and trailer data + * + * @throws \Exception if it was unable to find startxref + * @throws \Exception if it was unable to find xref + */ + protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array + { + // Check for circular references to prevent infinite loops + if (\in_array($offset, $visitedOffsets, true)) { + // We've already processed this offset, skip to avoid infinite loop + return $xref; + } + + // Track this offset as visited + $visitedOffsets[] = $offset; + // If the $offset is currently pointed at whitespace, bump it + // forward until it isn't; affects loosely targetted offsets + // for the 'xref' keyword + // See: https://github.com/smalot/pdfparser/issues/673 + $bumpOffset = $offset; + while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) { + ++$bumpOffset; + } + + // Find all startxref tables from this $offset forward + $startxrefPreg = preg_match_all( + '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', + $pdfData, + $startxrefMatches, + \PREG_SET_ORDER, + $offset + ); + + if (0 == $startxrefPreg) { + // No startxref tables were found + throw new \Exception('Unable to find startxref'); + } elseif (0 == $offset) { + // Use the last startxref in the document + $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1]; + } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) { + // Already pointing at the xref table + $startxref = $bumpOffset; + } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) { + // Cross-Reference Stream object + $startxref = $bumpOffset; + } else { + // Use the next startxref from this $offset + $startxref = (int) $startxrefMatches[0][1]; + } + + if ($startxref > 
\strlen($pdfData)) { + throw new \Exception('Unable to find xref (PDF corrupted?)'); + } + + // check xref position + if (strpos($pdfData, 'xref', $startxref) == $startxref) { + // Cross-Reference + $xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets); + } else { + // Check if the $pdfData might have the wrong line-endings + $pdfDataUnix = str_replace("\r\n", "\n", $pdfData); + if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) { + // Return Unix-line-ending flag + $xref = ['Unix' => true]; + } else { + // Cross-Reference Stream + $xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets); + } + } + if (empty($xref)) { + throw new \Exception('Unable to find xref'); + } + + return $xref; + } + + /** + * Parses PDF data and returns extracted data as array. + * + * @param string $data PDF data to parse + * + * @return array array of parsed PDF document objects + * + * @throws EmptyPdfException if empty PDF data given + * @throws MissingPdfHeaderException if PDF data missing `%PDF-` header + */ + public function parseData(string $data): array + { + if (empty($data)) { + throw new EmptyPdfException('Empty PDF data given.'); + } + // find the pdf header starting position + if (false === ($trimpos = strpos($data, '%PDF-'))) { + throw new MissingPdfHeaderException('Invalid PDF data: Missing `%PDF-` header.'); + } + + // get PDF content string + $pdfData = $trimpos > 0 ? 
substr($data, $trimpos) : $data; + + // get xref and trailer data + $xref = $this->getXrefData($pdfData); + + // If we found Unix line-endings + if (isset($xref['Unix'])) { + $pdfData = str_replace("\r\n", "\n", $pdfData); + $xref = $this->getXrefData($pdfData); + } + + // parse all document objects + $objects = []; + foreach ($xref['xref'] as $obj => $offset) { + if (!isset($objects[$obj]) && ($offset > 0)) { + // decode objects with positive offset + $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true); + } + } + + return [$xref, $objects]; + } +} diff --git a/src/Smalot/PdfParser/Tests/Units/Document.php b/src/Smalot/PdfParser/Tests/Units/Document.php deleted file mode 100644 index 684403f9..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Document.php +++ /dev/null @@ -1,184 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . 
- * - */ - -namespace Smalot\PdfParser\Tests\Units; - -use mageekguy\atoum; - -/** - * Class Document - * - * @package Smalot\PdfParser\Tests\Units - */ -class Document extends atoum\test -{ - public function testSetObjects() - { - $document = new \Smalot\PdfParser\Document(); - $object = new \Smalot\PdfParser\Object($document); - // Obj #1 is missing - $this->assert->variable($document->getObjectById(1))->isNull(); - $document->setObjects(array(1 => $object)); - // Obj #1 exists - $this->assert->object($document->getObjectById(1))->isInstanceOf('\Smalot\PdfParser\Object'); - - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object = new \Smalot\PdfParser\Object($document, $header); - $document->setObjects(array(2 => $object)); - // Obj #1 is missing - $this->assert->assert->variable($document->getObjectById(1))->isNull(); - // Obj #2 exists - $this->assert->object($document->getObjectById(2))->isInstanceOf('\Smalot\PdfParser\Object'); - } - - public function testGetObjects() - { - $document = new \Smalot\PdfParser\Document(); - $object1 = new \Smalot\PdfParser\Object($document); - $content = '<>unparsed content'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - - $object2 = new \Smalot\PdfParser\Page($document, $header); - $document->setObjects(array(1 => $object1, 2 => $object2)); - - $this->assert->integer(count($objects = $document->getObjects()))->isEqualTo(2); - $this->assert->object($objects[1])->isInstanceOf('\Smalot\PdfParser\Object'); - $this->assert->object($objects[2])->isInstanceOf('\Smalot\PdfParser\Object'); - $this->assert->object($objects[2])->isInstanceOf('\Smalot\PdfParser\Page'); - } - - public function testDictionary() - { - $document = new \Smalot\PdfParser\Document(); - $this->assert->integer(count($objects = $document->getDictionary()))->isEqualTo(0); - $object1 = new \Smalot\PdfParser\Object($document); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, 
$document); - $object2 = new \Smalot\PdfParser\Page($document, $header); - $document->setObjects(array(1 => $object1, 2 => $object2)); - $this->assert->integer(count($objects = $document->getDictionary()))->isEqualTo(1); - $this->assert->integer(count($objects['Page']))->isEqualTo(1); - $this->assert->integer($objects['Page'][2])->isEqualTo(2); - } - - public function testGetObjectsByType() - { - $document = new \Smalot\PdfParser\Document(); - $object1 = new \Smalot\PdfParser\Object($document); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object2 = new \Smalot\PdfParser\Page($document, $header); - $document->setObjects(array(1 => $object1, 2 => $object2)); - $this->assert->integer(count($objects = $document->getObjectsByType('Page')))->isEqualTo(1); - $this->assert->object($objects[2])->isInstanceOf('\Smalot\PdfParser\Object'); - $this->assert->object($objects[2])->isInstanceOf('\Smalot\PdfParser\Page'); - } - - public function testGetPages() - { - // Missing catalog - $document = new \Smalot\PdfParser\Document(); - try { - $pages = $document->getPages(); - $this->assert->boolean($pages)->isEqualTo(false); - } catch (\Exception $e) { - $this->assert->object($e)->isInstanceOf('\Exception'); - } - - // Listing pages from type Page - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object1 = new \Smalot\PdfParser\Page($document, $header); - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object2 = new \Smalot\PdfParser\Page($document, $header); - $document->setObjects(array(1 => $object1, 2 => $object2)); - $pages = $document->getPages(); - $this->assert->integer(count($pages))->isEqualTo(2); - $this->assert->object($pages[0])->isInstanceOf('\Smalot\PdfParser\Page'); - $this->assert->object($pages[1])->isInstanceOf('\Smalot\PdfParser\Page'); - - // Listing pages from type Pages (kids) - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - 
$object1 = new \Smalot\PdfParser\Page($document, $header); - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object2 = new \Smalot\PdfParser\Page($document, $header); - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object3 = new \Smalot\PdfParser\Page($document, $header); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object4 = new \Smalot\PdfParser\Pages($document, $header); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object5 = new \Smalot\PdfParser\Pages($document, $header); - $document->setObjects( - array('1_0' => $object1, '2_0' => $object2, '3_0' => $object3, '4_0' => $object4, '5_0' => $object5) - ); - $pages = $document->getPages(); - $this->assert->integer(count($pages))->isEqualTo(3); - $this->assert->object($pages[0])->isInstanceOf('\Smalot\PdfParser\Page'); - $this->assert->object($pages[1])->isInstanceOf('\Smalot\PdfParser\Page'); - $this->assert->object($pages[2])->isInstanceOf('\Smalot\PdfParser\Page'); - - // Listing pages from type Catalog - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object1 = new \Smalot\PdfParser\Page($document, $header); - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object2 = new \Smalot\PdfParser\Page($document, $header); - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object3 = new \Smalot\PdfParser\Page($document, $header); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object4 = new \Smalot\PdfParser\Pages($document, $header); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object5 = new \Smalot\PdfParser\Pages($document, $header); - $content = '<>'; - $header = \Smalot\PdfParser\Header::parse($content, $document); - $object6 = new \Smalot\PdfParser\Pages($document, $header); - $document->setObjects( - array( - '1_0' => $object1, - '2_0' 
=> $object2, - '3_0' => $object3, - '4_0' => $object4, - '5_0' => $object5, - '6_0' => $object6 - ) - ); - $pages = $document->getPages(); - $this->assert->integer(count($pages))->isEqualTo(3); - $this->assert->object($pages[0])->isInstanceOf('\Smalot\PdfParser\Page'); - $this->assert->object($pages[1])->isInstanceOf('\Smalot\PdfParser\Page'); - $this->assert->object($pages[2])->isInstanceOf('\Smalot\PdfParser\Page'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element.php b/src/Smalot/PdfParser/Tests/Units/Element.php deleted file mode 100644 index 0e3535a5..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element.php +++ /dev/null @@ -1,154 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units; - -use mageekguy\atoum; - -/** - * Class Element - * - * @package Smalot\PdfParser\Tests\Units - */ -class Element extends atoum\test -{ - public function testParse() - { - $document = new \Smalot\PdfParser\Document(array()); - - // Only_values = false. 
- $content = '/NameType /FlateDecode - /Contents[4 0 R 42]/Fonts<>/NullType - null/StringType(hello)/DateType(D:20130901235555+02\'00\')/XRefType 2 0 R - /NumericType 8/HexaType<0020>/BooleanType false'; - $offset = 0; - $elements = \Smalot\PdfParser\Element::parse($content, $document, $offset, false); - - $this->assert->array($elements)->hasKey('NameType'); - $this->assert->object($elements['NameType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementName'); - $this->assert->string($elements['NameType']->getContent())->isEqualTo('FlateDecode'); - - $this->assert->boolean(array_key_exists('Contents', $elements))->isEqualTo(true); - $this->assert->object($elements['Contents'])->isInstanceOf('\Smalot\PdfParser\Element\ElementArray'); - $this->assert->boolean($elements['Contents']->contains(42))->isEqualTo(true); - - $this->assert->boolean(array_key_exists('Fonts', $elements))->isEqualTo(true); - $this->assert->object($elements['Fonts'])->isInstanceOf('\Smalot\PdfParser\Header'); - - $this->assert->boolean(array_key_exists('NullType', $elements))->isEqualTo(true); - $this->assert->object($elements['NullType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementNull'); - $this->assert->castToString($elements['NullType'])->isEqualTo('null'); - - $this->assert->boolean(array_key_exists('StringType', $elements))->isEqualTo(true); - $this->assert->object($elements['StringType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementString'); - $this->assert->string($elements['StringType']->getContent())->isEqualTo('hello'); - - $this->assert->boolean(array_key_exists('DateType', $elements))->isEqualTo(true); - $this->assert->object($elements['DateType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementDate'); -// $this->assert->castToString($elements['DateType'])->isEqualTo('2013-09-01T23:55:55+02:00'); - - $this->assert->boolean(array_key_exists('XRefType', $elements))->isEqualTo(true); - 
$this->assert->object($elements['XRefType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementXRef'); - $this->assert->string($elements['XRefType']->getId())->isEqualTo('2_0'); - - $this->assert->boolean(array_key_exists('NumericType', $elements))->isEqualTo(true); - $this->assert->object($elements['NumericType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementNumeric'); - $this->assert->castToString($elements['NumericType'])->isEqualTo('8'); - - $this->assert->boolean(array_key_exists('HexaType', $elements))->isEqualTo(true); - $this->assert->object($elements['HexaType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementString'); - $this->assert->string($elements['HexaType']->getContent())->isEqualTo(' '); - - $this->assert->boolean(array_key_exists('BooleanType', $elements))->isEqualTo(true); - $this->assert->object($elements['BooleanType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementBoolean'); - $this->assert->boolean($elements['BooleanType']->getContent())->isEqualTo(false); - - // Only_values = true. - $content = '/NameType /FlateDecode'; - $offset = 0; - $elements = \Smalot\PdfParser\Element::parse($content, $document, $offset, true); - $this->assert->array($elements)->hasSize(2); - $this->assert->integer($offset)->isEqualTo(22); - - // Test error. 
- $content = '/NameType /FlateDecode $$$'; - $offset = 0; - $elements = \Smalot\PdfParser\Element::parse($content, $document, $offset, false); - $this->assert->array($elements)->hasSize(1); - $this->assert->integer($offset)->isEqualTo(22); - $this->assert->string(key($elements))->isEqualTo('NameType'); - $this->assert->object(current($elements))->isInstanceOf('\Smalot\PdfParser\Element\ElementName'); - - $content = '/NameType $$$'; - $offset = 0; - $elements = \Smalot\PdfParser\Element::parse($content, $document, $offset, false); - $this->assert->integer($offset)->isEqualTo(0); - $this->assert->array($elements)->isEmpty(); - - /*$this->assert->boolean(array_key_exists('NameType', $elements))->isEqualTo(true); - $this->assert->boolean($elements['NameType'])->isInstanceOf('\Smalot\PdfParser\Element\ElementName)->isEqualTo(true); - $this->assert->string($elements['NameType']->getContent())->isEqualTo('FlateDecode');*/ - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element(42); - $content = $element->getContent(); - $this->assert->integer($content)->isEqualTo(42); - - $element = new \Smalot\PdfParser\Element(array(4, 2)); - $content = $element->getContent(); - $this->assert->array($content)->hasSize(2); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element(2); - - $this->assert->boolean($element->equals(2))->isEqualTo(true); - $this->assert->boolean($element->equals(8))->isEqualTo(false); - } - - public function testContains() - { - $val_4 = new \Smalot\PdfParser\Element(4); - $val_2 = new \Smalot\PdfParser\Element(2); - $element = new \Smalot\PdfParser\Element(array($val_4, $val_2)); - - $this->assert->boolean($element->contains(2))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element(2); - $this->assert->castToString($element)->isEqualTo('2'); - } -} diff --git 
a/src/Smalot/PdfParser/Tests/Units/Element/ElementArray.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementArray.php deleted file mode 100644 index 971f3e11..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementArray.php +++ /dev/null @@ -1,189 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; -use Smalot\PdfParser\Document; -use Smalot\PdfParser\Header; -use Smalot\PdfParser\Page; - -/** - * Class ElementArray - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementArray extends atoum\test -{ - public function testParse() - { - $document = new \Smalot\PdfParser\Document(array()); - - // Skipped. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse('ABC', $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse(' / [ 4 2 ] ', $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse(' 0 [ 4 2 ] ', $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse(" 0 \n [ 4 2 ] ", $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse(' [ 4 2 ] ', $document, $offset); - $this->assert->boolean($element->contains(4))->isEqualTo(true); - $this->assert->boolean($element->contains(2))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(8); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse(' [ 4 2 ]', $document, $offset); - $this->assert->boolean($element->contains(4))->isEqualTo(true); - $this->assert->boolean($element->contains(2))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(8); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementArray::parse('[ 4 2 ]', $document, $offset); - $this->assert->boolean($element->contains(4))->isEqualTo(true); - $this->assert->boolean($element->contains(2))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(7); - $offset = 0; - $element = 
\Smalot\PdfParser\Element\ElementArray::parse(" \n [ 4 2 ] ", $document, $offset); - $this->assert->boolean($element->contains(4))->isEqualTo(true); - $this->assert->boolean($element->contains(2))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(10); - } - - public function testGetContent() - { - $val_4 = new \Smalot\PdfParser\Element\ElementNumeric('4'); - $val_2 = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $element = new \Smalot\PdfParser\Element\ElementArray(array($val_4, $val_2)); - - $content = $element->getContent(); - $this->assert->array($content)->hasSize(2); - } - - public function testContains() - { - $val_4 = new \Smalot\PdfParser\Element\ElementNumeric('4'); - $val_2 = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $element = new \Smalot\PdfParser\Element\ElementArray(array($val_4, $val_2)); - - $this->assert->boolean($element->contains(2))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - } - - public function testResolveXRef() - { - // Document with text. - $filename = __DIR__ . '/../../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $object = $document->getObjectById('3_0'); - $kids = $object->get('Kids'); - - $this->assert->object($kids)->isInstanceOf('\Smalot\PdfParser\Element\ElementArray'); - $this->assert->array($kids->getContent())->hasSize(1); - - $pages = $kids->getContent(); - $this->assert->object(reset($pages))->isInstanceOf('\Smalot\PdfParser\Page'); - } - - public function testGetDetails() - { -// // Document with text. -// $filename = __DIR__ . 
'/../../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; -// $parser = new \Smalot\PdfParser\Parser(); -// $document = $parser->parseFile($filename); -// $object = $document->getObjectById('3_0'); -// /** @var \Smalot\PdfParser\Element\ElementArray $kids */ -// $kids = $object->get('Kids'); -// $details = $kids->getDetails(); -// -// $this->assert->array($details)->hasSize(1); -// $this->assert->string($details[0]['Type'])->isEqualTo('Page'); - - $document = new Document(); - $content = '<> [8 [9 <>]]]>>'; - $details_reference = array( - 'Type' => 'Page', - 'Types' => array( - 8, - ), - 'Sizes' => array( - 1, - 2, - 3, - 4, - 5, - array( - 'Subtype' => 'XObject', - ), - array( - 8, - array( - 9, - array( - 'FontSize' => 10, - ), - ), - ), - ), - ); - $header = Header::parse($content, $document); - $details = $header->getDetails(); - - $this->assert->array($details)->hasSize(3); - $this->assert->array($details)->isEqualTo($details_reference); - } - - public function test__toString() - { - $val_4 = new \Smalot\PdfParser\Element\ElementNumeric('4'); - $val_2 = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $element = new \Smalot\PdfParser\Element\ElementArray(array($val_4, $val_2)); - $this->assert->castToString($element)->isEqualTo('4,2'); - - $document = new \Smalot\PdfParser\Document(array()); - $element = \Smalot\PdfParser\Element\ElementArray::parse(' [ 4 2 ]', $document); - $this->assert->castToString($element)->isEqualTo('4,2'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementBoolean.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementBoolean.php deleted file mode 100644 index a2f75c0f..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementBoolean.php +++ /dev/null @@ -1,135 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementBoolean - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementBoolean extends atoum\test -{ - public function testParse() - { - // Skipped. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' [ false ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' << true >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' / false ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' 0 true ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - 
$element = \Smalot\PdfParser\Element\ElementBoolean::parse(" 0 \n true ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' true ', null, $offset); - $this->assert->boolean($element->getContent())->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' TRUE ', null, $offset); - $this->assert->boolean($element->getContent())->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(' True', null, $offset); - $this->assert->boolean($element->getContent())->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse('true', null, $offset); - $this->assert->boolean($element->getContent())->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(4); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse('False', null, $offset); - $this->assert->boolean($element->getContent())->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementBoolean::parse(" \n true ", null, $offset); - $this->assert->boolean($element->getContent())->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(7); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementBoolean('true'); - $this->assert->boolean($element->getContent())->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementBoolean('false'); - $this->assert->boolean($element->getContent())->isEqualTo(false); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementBoolean('true'); - 
$this->assert->boolean($element->equals(true))->isEqualTo(true); - $this->assert->boolean($element->equals(1))->isEqualTo(false); - $this->assert->boolean($element->equals(false))->isEqualTo(false); - $this->assert->boolean($element->equals(null))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementBoolean('false'); - $this->assert->boolean($element->equals(false))->isEqualTo(true); - $this->assert->boolean($element->equals(0))->isEqualTo(false); - $this->assert->boolean($element->equals(true))->isEqualTo(false); - $this->assert->boolean($element->equals(null))->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementBoolean('true'); - $this->assert->boolean($element->contains(true))->isEqualTo(true); - $this->assert->boolean($element->contains(false))->isEqualTo(false); - $this->assert->boolean($element->contains(1))->isEqualTo(false); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementBoolean('true'); - $this->assert->castToString($element)->isEqualTo('true'); - $element = new \Smalot\PdfParser\Element\ElementBoolean('false'); - $this->assert->castToString($element)->isEqualTo('false'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementDate.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementDate.php deleted file mode 100644 index a9f18b49..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementDate.php +++ /dev/null @@ -1,164 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementDate - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementDate extends atoum\test -{ - public function testParse() - { - // Skipped. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' [ (ABC) 5 6 ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' << (invalid) >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' / (FlateDecode) ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' 0 (FlateDecode) ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(" 0 \n (FlateDecode) ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' (D:20130901235555+02\'00\') ', null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->castToString($element)->isEqualTo('2013-09-01T23:55:55+02:00'); - $this->assert->integer($offset)->isEqualTo(26); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' (D:20130901235555+02\'00\') ', null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->castToString($element)->isEqualTo('2013-09-01T23:55:55+02:00'); - $this->assert->integer($offset)->isEqualTo(26); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(' (D:20130901235555+02\'00\')', null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->castToString($element)->isEqualTo('2013-09-01T23:55:55+02:00'); - $this->assert->integer($offset)->isEqualTo(26); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse('(D:20130901235555+02\'00\')', null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->castToString($element)->isEqualTo('2013-09-01T23:55:55+02:00'); - $this->assert->integer($offset)->isEqualTo(25); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(" \n (D:20130901235555+02'00') ", null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->castToString($element)->isEqualTo('2013-09-01T23:55:55+02:00'); - $this->assert->integer($offset)->isEqualTo(28); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(" \n (D:20130901235555) ", null, $offset); - $element->setFormat('c'); - 
$this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->boolean($element->equals(new \DateTime('2013-09-01T23:55:55')))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(21); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse("(D:20131206091846Z00'00')", null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - //$this->assert->boolean($element->equals(new \DateTime('2013-09-01T23:55:55')))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(25); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(" \n (D:1-23-2014, 19:02:15-03'00') ", null, $offset); - $element->setFormat('c'); - $this->assert->object($element->getContent())->isInstanceOf('\DateTime'); - $this->assert->castToString($element)->isEqualTo('2014-01-23T19:02:15-03:00'); - $this->assert->integer($offset)->isEqualTo(33); - - // Format invalid - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementDate::parse(" \n (D:2013+02'00') ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); - $this->assert->dateTime($element->getContent())->isEqualTo(new \DateTime('2013-09-01 21:55:55+00:00')); - - try { - $element = new \Smalot\PdfParser\Element\ElementDate('2013-09-01 23:55:55+02:00'); - $this->assert->boolean(false)->isEqualTo(true); - } catch (\Exception $e) { - $this->assert->exception($e)->hasMessage('DateTime required.'); - } - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); - $element->setFormat('c'); - $this->assert->boolean($element->equals('2013-09-01T23:55:55+02:00'))->isEqualTo(true); - 
$this->assert->boolean($element->equals('2013-09-01T23:55:55+01:00'))->isEqualTo(false); - $this->assert->boolean($element->equals(new \DateTime('2013-09-01T21:55:55+00:00')))->isEqualTo(true); - $this->assert->boolean($element->equals(new \DateTime('2013-09-01T23:55:55+01:00')))->isEqualTo(false); - $this->assert->boolean($element->equals('ABC'))->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); - $this->assert->boolean($element->contains('2013-09-01T21:55:55+00:00'))->isEqualTo(true); - $this->assert->boolean($element->contains('2013-06-15'))->isEqualTo(false); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); - $element->setFormat('c'); - $this->assert->castToString($element)->isEqualTo('2013-09-01T23:55:55+02:00'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementHexa.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementHexa.php deleted file mode 100644 index 56b39066..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementHexa.php +++ /dev/null @@ -1,106 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. 
- * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementHexa - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementHexa extends atoum\test -{ - public function testParse() - { - // Skipped. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' [ <0020> 5 6 ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' << <0020> >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' / <0020> ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' 0 <0020> ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(" 0 \n <0020> ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' <0020> ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo(' '); - $this->assert->integer($offset)->isEqualTo(7); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' <0020> ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo(' '); - $this->assert->integer($offset)->isEqualTo(7); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(' <0020>', null, $offset); - $this->assert->string($element->getContent())->isEqualTo(' '); - $this->assert->integer($offset)->isEqualTo(7); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse('<0020>', null, $offset); - $this->assert->string($element->getContent())->isEqualTo(' '); - $this->assert->integer($offset)->isEqualTo(6); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(" \n <0020> ", null, $offset); - $this->assert->string($element->getContent())->isEqualTo(' '); - $this->assert->integer($offset)->isEqualTo(9); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(" \n <5465616d204d616e6167656d656e742053797374656d73> ", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Team Management Systems'); - $this->assert->integer($offset)->isEqualTo(51); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(" \n <5265706f72744275696c646572> ", null, $offset); - $this->assert->object($element)->isInstanceOf('\Smalot\PdfParser\Element\ElementString'); - $this->assert->string($element->getContent())->isEqualTo('ReportBuilder'); - $this->assert->integer($offset)->isEqualTo(31); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementHexa::parse(" \n <443a3230313331323137313334303435303027303027> ", null, $offset); - $this->assert->object($element)->isInstanceOf('\Smalot\PdfParser\Element\ElementDate'); - 
$this->assert->castToString($element)->isEqualTo('2013-12-17T13:40:45+00:00'); - $this->assert->integer($offset)->isEqualTo(49); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementMissing.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementMissing.php deleted file mode 100644 index 1d8c21c2..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementMissing.php +++ /dev/null @@ -1,71 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . 
- * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementMissing - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementMissing extends atoum\test -{ - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementMissing(null); - $this->assert->boolean($element->equals(null))->isEqualTo(false); - $this->assert->boolean($element->equals(true))->isEqualTo(false); - $this->assert->boolean($element->equals('A'))->isEqualTo(false); - $this->assert->boolean($element->equals(false))->isEqualTo(false); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementMissing(null); - $this->assert->boolean($element->getContent())->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementMissing(null); - $this->assert->boolean($element->contains(null))->isEqualTo(false); - $this->assert->boolean($element->contains(true))->isEqualTo(false); - $this->assert->boolean($element->contains('A'))->isEqualTo(false); - $this->assert->boolean($element->contains(false))->isEqualTo(false); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementMissing(null); - $this->assert->castToString($element)->isEqualTo(''); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementName.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementName.php deleted file mode 100644 index de90fc23..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementName.php +++ /dev/null @@ -1,157 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementName - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementName extends atoum\test -{ - public function testParse() - { - // Skipped. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(' [ /ABC 5 6 ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(' << invalid >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(' / FlateDecode ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(' 0 /FlateDecode ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - 
$element = \Smalot\PdfParser\Element\ElementName::parse(" 0 \n /FlateDecode ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(' /FlateDecode ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('FlateDecode'); - $this->assert->integer($offset)->isEqualTo(13); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(' /FlateDecode', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('FlateDecode'); - $this->assert->integer($offset)->isEqualTo(13); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/FlateDecode', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('FlateDecode'); - $this->assert->integer($offset)->isEqualTo(12); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse(" \n /FlateDecode ", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('FlateDecode'); - $this->assert->integer($offset)->isEqualTo(15); - - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/FlateDecode2', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('FlateDecode2'); - $this->assert->integer($offset)->isEqualTo(13); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/Flate-Decode2', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Flate-Decode2'); - $this->assert->integer($offset)->isEqualTo(14); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/OJHCYD+Cambria', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('OJHCYD+Cambria'); - $this->assert->integer($offset)->isEqualTo(15); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/OJHCYD+Cambria,Bold', null, $offset); - 
$this->assert->string($element->getContent())->isEqualTo('OJHCYD+Cambria,Bold'); - $this->assert->integer($offset)->isEqualTo(20); - - // - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/Flate_Decode2', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Flate'); - $this->assert->integer($offset)->isEqualTo(6); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementName::parse('/Flate.Decode2', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Flate.Decode2'); - $this->assert->integer($offset)->isEqualTo(14); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementName('FlateDecode'); - $this->assert->string($element->getContent())->isEqualTo('FlateDecode'); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementName('FlateDecode'); - $this->assert->boolean($element->equals('FlateDecode'))->isEqualTo(true); - $this->assert->boolean($element->equals('Flatedecode'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementName('FlateDecode2'); - $this->assert->boolean($element->equals('FlateDecode2'))->isEqualTo(true); - $this->assert->boolean($element->equals('FlateDecode3'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementName('Flate-Decode2'); - $this->assert->boolean($element->equals('Flate-Decode2'))->isEqualTo(true); - $this->assert->boolean($element->equals('Flate-Decode3'))->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementName('FlateDecode'); - $this->assert->boolean($element->contains('FlateDecode'))->isEqualTo(true); - $this->assert->boolean($element->contains('Flatedecode'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementName('FlateDecode2'); - $this->assert->boolean($element->contains('FlateDecode2'))->isEqualTo(true); - 
$this->assert->boolean($element->contains('FlateDecode3'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementName('Flate-Decode2'); - $this->assert->boolean($element->contains('Flate-Decode2'))->isEqualTo(true); - $this->assert->boolean($element->contains('Flate-Decode3'))->isEqualTo(false); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementName('FlateDecode'); - $this->assert->castToString($element)->isEqualTo('FlateDecode'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementNull.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementNull.php deleted file mode 100644 index 859b03c7..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementNull.php +++ /dev/null @@ -1,121 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementNull - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementNull extends atoum\test -{ - public function testParse() - { - // Skipped. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' [ null ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' << null >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' / null ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' 0 null ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(" 0 \n null ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' null ', null, $offset); - $this->assert->boolean(is_null($element->getContent()))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' null ', null, $offset); - $this->assert->boolean(is_null($element->getContent()))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(' null', null, $offset); - $this->assert->boolean(is_null($element->getContent()))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse('null', null, $offset); - $this->assert->boolean(is_null($element->getContent()))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(4); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNull::parse(" \n null ", null, $offset); - $this->assert->boolean(is_null($element->getContent()))->isEqualTo(true); - $this->assert->integer($offset)->isEqualTo(7); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementNull('null'); - $this->assert->boolean(is_null($element->getContent()))->isEqualTo(true); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementNull('null'); - $this->assert->boolean($element->equals(null))->isEqualTo(true); - $this->assert->boolean($element->equals(false))->isEqualTo(false); - $this->assert->boolean($element->equals(0))->isEqualTo(false); - $this->assert->boolean($element->equals(1))->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementNull('null'); - $this->assert->boolean($element->contains(null))->isEqualTo(true); - $this->assert->boolean($element->contains(false))->isEqualTo(false); - $this->assert->boolean($element->contains(0))->isEqualTo(false); - } - - 
public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementNull('null'); - $this->assert->castToString($element)->isEqualTo('null'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementNumeric.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementNumeric.php deleted file mode 100644 index 76dc68b9..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementNumeric.php +++ /dev/null @@ -1,184 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementNumeric - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementNumeric extends atoum\test -{ - public function testParse() - { - // Skipped. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' [ 2 ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' /2', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(" /2 \n 2", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' -2', null, $offset); - $this->assert->float($element->getContent())->isEqualTo(-2.0); - $this->assert->integer($offset)->isEqualTo(3); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse('2BC', null, $offset); - $this->assert->float($element->getContent())->isEqualTo(2.0); - $this->assert->integer($offset)->isEqualTo(1); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' 2BC', null, $offset); - $this->assert->float($element->getContent())->isEqualTo(2.0); - $this->assert->integer($offset)->isEqualTo(2); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' -2BC', null, $offset); - $this->assert->float($element->getContent())->isEqualTo(-2.0); - $this->assert->integer($offset)->isEqualTo(3); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' -2', null, $offset); - $this->assert->float($element->getContent())->isEqualTo(-2.0); - $this->assert->integer($offset)->isEqualTo(3); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(' 2 0 obj', null, $offset); 
- $this->assert->float($element->getContent())->isEqualTo(2.0); - $this->assert->integer($offset)->isEqualTo(2); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementNumeric::parse(" \n -2 ", null, $offset); - $this->assert->float($element->getContent())->isEqualTo(-2.0); - $this->assert->integer($offset)->isEqualTo(5); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementNumeric('B'); - $this->assert->float($element->getContent())->isEqualTo(0.0); - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2.5'); - $this->assert->float($element->getContent())->isEqualTo(-2.5); - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2'); - $this->assert->float($element->getContent())->isEqualTo(-2.0); - $element = new \Smalot\PdfParser\Element\ElementNumeric(' -2'); - $this->assert->float($element->getContent())->isEqualTo(-2.0); - $element = new \Smalot\PdfParser\Element\ElementNumeric('2.5'); - $this->assert->float($element->getContent())->isEqualTo(2.5); - $element = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $this->assert->float($element->getContent())->isEqualTo(2.0); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementNumeric('1'); - $this->assert->boolean($element->equals('B'))->isEqualTo(false); - $element = new \Smalot\PdfParser\Element\ElementNumeric('1.5'); - $this->assert->boolean($element->equals('B'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $this->assert->boolean($element->equals('2'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $this->assert->boolean($element->equals('3'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2'); - $this->assert->boolean($element->equals('-2'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2'); - $this->assert->boolean($element->equals('-3'))->isEqualTo(false); - - 
$element = new \Smalot\PdfParser\Element\ElementNumeric('2.5'); - $this->assert->boolean($element->equals('2.5'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('2.5'); - $this->assert->boolean($element->equals('3.5'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2.5'); - $this->assert->boolean($element->equals('-2.5'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2.5'); - $this->assert->boolean($element->equals('-3.5'))->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementNumeric('1'); - $this->assert->boolean($element->contains('B'))->isEqualTo(false); - $element = new \Smalot\PdfParser\Element\ElementNumeric('1.5'); - $this->assert->boolean($element->contains('B'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $this->assert->boolean($element->contains('2'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $this->assert->boolean($element->contains('3'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2'); - $this->assert->boolean($element->contains('-2'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2'); - $this->assert->boolean($element->contains('-3'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('2.5'); - $this->assert->boolean($element->contains('2.5'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('2.5'); - $this->assert->boolean($element->contains('3.5'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2.5'); - $this->assert->boolean($element->contains('-2.5'))->isEqualTo(true); - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2.5'); - $this->assert->boolean($element->contains('-3.5'))->isEqualTo(false); - } - - public function test__toString() - { - 
$element = new \Smalot\PdfParser\Element\ElementNumeric('B'); - $this->assert->castToString($element)->isEqualTo('0'); - $element = new \Smalot\PdfParser\Element\ElementNumeric('1B'); - $this->assert->castToString($element)->isEqualTo('1'); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('2'); - $this->assert->castToString($element)->isEqualTo('2'); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2'); - $this->assert->castToString($element)->isEqualTo('-2'); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('2.5'); - $this->assert->castToString($element)->isEqualTo('2.5'); - - $element = new \Smalot\PdfParser\Element\ElementNumeric('-2.5'); - $this->assert->castToString($element)->isEqualTo('-2.5'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementString.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementString.php deleted file mode 100644 index 99868887..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementString.php +++ /dev/null @@ -1,156 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . 
- * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementString - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementString extends atoum\test -{ - public function testParse() - { - // Skipped. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' [ (ABC) 5 6 ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' << (invalid) >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' / (FlateDecode) ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' 0 (FlateDecode) ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(" 0 \n (FlateDecode) ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' (Copyright) ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Copyright'); - $this->assert->integer($offset)->isEqualTo(12); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' (Copyright) ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Copyright'); - $this->assert->integer($offset)->isEqualTo(12); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(' (Copyright)', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Copyright'); - $this->assert->integer($offset)->isEqualTo(12); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse('(Copyright)', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Copyright'); - $this->assert->integer($offset)->isEqualTo(11); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse('(Copy-right2)', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Copy-right2'); - $this->assert->integer($offset)->isEqualTo(13); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse(" \n (Copyright) ", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Copyright'); - $this->assert->integer($offset)->isEqualTo(14); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse('()', null, $offset); - $this->assert->string($element->getContent())->isEqualTo(''); - $this->assert->integer($offset)->isEqualTo(2); - - // Complex study case : Unicode + octal. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse("(ABC\\))", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('ABC)'); - $this->assert->integer($offset)->isEqualTo(7); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse("(\xFE\xFF\\000M)", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('M'); - $this->assert->integer($offset)->isEqualTo(9); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse("(<20>)", null, $offset); - $this->assert->string($element->getContent())->isEqualTo(' '); - $this->assert->integer($offset)->isEqualTo(6); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementString::parse("(Gutter\\ console\\ assembly)", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('Gutter console assembly'); - $this->assert->integer($offset)->isEqualTo(27); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementString('Copyright'); - $this->assert->string($element->getContent())->isEqualTo('Copyright'); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementString('CopyRight'); - $this->assert->boolean($element->equals('CopyRight'))->isEqualTo(true); - $this->assert->boolean($element->equals('Flatedecode'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementString('CopyRight2'); - $this->assert->boolean($element->equals('CopyRight2'))->isEqualTo(true); - $this->assert->boolean($element->equals('CopyRight3'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementString('Flate-Decode2'); - $this->assert->boolean($element->equals('Flate-Decode2'))->isEqualTo(true); - $this->assert->boolean($element->equals('Flate-Decode3'))->isEqualTo(false); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementString('CopyRight'); - 
$this->assert->boolean($element->contains('CopyRight'))->isEqualTo(true); - $this->assert->boolean($element->contains('Copyright'))->isEqualTo(false); - - $element = new \Smalot\PdfParser\Element\ElementString('CopyRight2'); - $this->assert->boolean($element->contains('CopyRight2'))->isEqualTo(true); - $this->assert->boolean($element->contains('CopyRight3'))->isEqualTo(false); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementString('CopyRight'); - $this->assert->castToString($element)->isEqualTo('CopyRight'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementStruct.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementStruct.php deleted file mode 100644 index 6e6aaa9f..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementStruct.php +++ /dev/null @@ -1,98 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementStruct - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementStruct extends atoum\test -{ - public function testParse() - { - $document = new \Smalot\PdfParser\Document(array()); - - // Skipped. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse('ABC', $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse( - ' [ << /Filter /FlateDecode >> ]', - $document, - $offset - ); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse(' / << /Filter /FlateDecode >> ', $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse(' 0 << /Filter /FlateDecode >> ', $document, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse( - " 0 \n << /Filter /FlateDecode >> ", - $document, - $offset - ); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse(' << /Filter /FlateDecode >> ', $document, $offset); - $this->assert->object($element)->isInstanceOf('\Smalot\PdfParser\Header'); - $this->assert->integer($offset)->isEqualTo(27); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse(' << /Filter /FlateDecode >>', $document, $offset); - $this->assert->object($element)->isInstanceOf('\Smalot\PdfParser\Header'); - $this->assert->integer($offset)->isEqualTo(27); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse('<< /Filter /FlateDecode >>', $document, $offset); - $this->assert->object($element)->isInstanceOf('\Smalot\PdfParser\Header'); - $this->assert->integer($offset)->isEqualTo(26); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementStruct::parse( - " \n << /Filter /FlateDecode >> ", - $document, - $offset - ); - $this->assert->object($element)->isInstanceOf('\Smalot\PdfParser\Header'); - $this->assert->integer($offset)->isEqualTo(29); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Element/ElementXRef.php b/src/Smalot/PdfParser/Tests/Units/Element/ElementXRef.php deleted file mode 100644 index 4ff27bc6..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Element/ElementXRef.php +++ /dev/null @@ -1,126 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units\Element; - -use mageekguy\atoum; - -/** - * Class ElementXRef - * - * @package Smalot\PdfParser\Tests\Units\Element - */ -class ElementXRef extends atoum\test -{ - public function testParse() - { - // Skipped. - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse('ABC', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' [ 5 0 R ]', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' << 5 0 R >>', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' / 5 0 R ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' 0 5 0 R ', null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(" 0 \n 5 0 R ", null, $offset); - $this->assert->boolean($element)->isEqualTo(false); - $this->assert->integer($offset)->isEqualTo(0); - - // Valid. 
- $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' 5 0 R ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('5_0'); - $this->assert->integer($offset)->isEqualTo(6); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' 5 0 R ', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('5_0'); - $this->assert->integer($offset)->isEqualTo(6); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(' 5 0 R', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('5_0'); - $this->assert->integer($offset)->isEqualTo(6); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse('5 0 R', null, $offset); - $this->assert->string($element->getContent())->isEqualTo('5_0'); - $this->assert->integer($offset)->isEqualTo(5); - $offset = 0; - $element = \Smalot\PdfParser\Element\ElementXRef::parse(" \n 5 0 R ", null, $offset); - $this->assert->string($element->getContent())->isEqualTo('5_0'); - $this->assert->integer($offset)->isEqualTo(8); - } - - public function testGetContent() - { - $element = new \Smalot\PdfParser\Element\ElementXRef('5_0'); - $this->assert->string($element->getContent())->isEqualTo('5_0'); - } - - public function testGetId() - { - $element = new \Smalot\PdfParser\Element\ElementXRef('5_0'); - $this->assert->string($element->getId())->isEqualTo('5_0'); - } - - public function testEquals() - { - $element = new \Smalot\PdfParser\Element\ElementXRef('5_0'); - $this->assert->boolean($element->equals(5))->isEqualTo(true); - $this->assert->boolean($element->equals(8))->isEqualTo(false); - $this->assert->boolean($element->equals($element))->isEqualTo(true); - } - - public function testContains() - { - $element = new \Smalot\PdfParser\Element\ElementXRef('5_0'); - $this->assert->boolean($element->contains(5))->isEqualTo(true); - $this->assert->boolean($element->contains(8))->isEqualTo(false); - 
$this->assert->boolean($element->contains($element))->isEqualTo(true); - } - - public function test__toString() - { - $element = new \Smalot\PdfParser\Element\ElementXRef('5_0'); - $this->assert->castToString($element)->isEqualTo('#Obj#5_0'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Font.php b/src/Smalot/PdfParser/Tests/Units/Font.php deleted file mode 100644 index 24b60db1..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Font.php +++ /dev/null @@ -1,299 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units; - -use mageekguy\atoum; -use Smalot\PdfParser\Header; - -/** - * Class Font - * - * @package Smalot\PdfParser\Tests\Units - */ -class Font extends atoum\test -{ - public function testGetName() - { - $filename = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $fonts = $document->getFonts(); - $font = reset($fonts); - - $this->assert->string($font->getName())->isEqualTo('OJHCYD+Cambria,Bold'); - } - - public function testGetType() - { - $filename = __DIR__ . 
'/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $fonts = $document->getFonts(); - $font = reset($fonts); - - $this->assert->string($font->getType())->isEqualTo('TrueType'); - } - - public function testGetDetails() - { - $filename = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $fonts = $document->getFonts(); - $font = reset($fonts); - $reference = array( - 'Name' => 'OJHCYD+Cambria,Bold', - 'Type' => 'TrueType', - 'Encoding' => 'Ansi', - 'BaseFont' => 'OJHCYD+Cambria,Bold', - 'FontDescriptor' => - array( - 'Type' => 'FontDescriptor', - 'FontName' => 'OJHCYD+Cambria,Bold', - 'Flags' => 4, - 'Ascent' => 699, - 'CapHeight' => 699, - 'Descent' => -7, - 'ItalicAngle' => 0, - 'StemV' => 128, - 'MissingWidth' => 658, - ), - 'ToUnicode' => - array( - 'Filter' => 'FlateDecode', - 'Length' => 219, - ), - 'FirstChar' => 1, - 'LastChar' => 11, - 'Widths' => - array( - 0 => 705, - 1 => 569, - 2 => 469, - 3 => 597, - 4 => 890, - 5 => 531, - 6 => 604, - 7 => 365, - 8 => 220, - 9 => 314, - 10 => 308, - ), - 'Subtype' => 'TrueType', - ); - $this->assert->array($font->getDetails())->isEqualTo($reference); - } - - public function testTranslateChar() - { - $filename = __DIR__ . 
'/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $fonts = $document->getFonts(); - /** @var \Smalot\PdfParser\Font $font */ - $font = reset($fonts); - - $this->assert->string($font->translateChar("\x01"))->isEqualTo('D'); - $this->assert->string($font->translateChar("\x02"))->isEqualTo('o'); - $this->assert->string($font->translateChar("\x03"))->isEqualTo('c'); - $this->assert->string($font->translateChar("\x04"))->isEqualTo('u'); - $this->assert->string($font->translateChar("\x99"))->isEqualTo(\Smalot\PdfParser\Font::MISSING); - } - - public function testLoadTranslateTable() - { - $document = new \Smalot\PdfParser\Document(); - - $content = '<>'; - $header = Header::parse($content, $document); - $font = new \Smalot\PdfParser\Font($document, $header); - - $content = '/CIDInit /ProcSet findresource begin -14 dict begin -begincmap -/CIDSystemInfo -<< /Registry (Adobe) -/Ordering (UCS) -/Supplement 0 ->> def -/CMapName /Adobe-Identity-UCS def -/CMapType 2 def -1 begincodespacerange -<0000> -endcodespacerange -3 beginbfchar -<0003> <0020> -<000F> <002C> -<0011> <002E> -endbfchar -2 beginbfrange -<0013> <0016> <0030> -<0018> <001C> <0035> -endbfrange -7 beginbfchar -<0023> <0040> -<0026> <0043> -<0028> <0045> -<0030> <004D> -<0033> <0050> -<0035> <0052> -<0039> <0056> -endbfchar -4 beginbfrange -<0044> <004C> <0061> -<004F> <0052> <006C> -<0054> <0059> <0071> -<005B> <005C> <0078> -endbfrange -4 beginbfchar -<0070> <00E9> -<00AB> <2026> -<00B0> <0153> -<00B6> <2019> -endbfchar -1 beginbfrange -<0084> <0086> [<0061> <0071> <0081>] -endbfrange -endcmap -CMapName currentdict /CMap defineresource pop -end -end'; - $unicode = new \Smalot\PdfParser\Object($document, null, $content); - - $document->setObjects(array('1_0' => $font, '2_0' => $unicode)); - - $font->init(); - // Test reload - $table = $font->loadTranslateTable(); - - $this->assert->array($table)->hasSize(47); 
- - // Test chars - $this->assert->string($table[3])->isEqualTo(' '); - $this->assert->string($table[15])->isEqualTo(','); - $this->assert->string($table[17])->isEqualTo('.'); - $this->assert->string($table[35])->isEqualTo('@'); - $this->assert->string($table[57])->isEqualTo('V'); - - // Test ranges - $this->assert->string($table[85])->isEqualTo('r'); - $this->assert->string($table[92])->isEqualTo('y'); - } - - public function testDecodeHexadecimal() - { - $hexa = '<322041>'; - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa))->isEqualTo("2 A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, false))->isEqualTo("2 A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, true))->isEqualTo("(2 A)"); - - $hexa = '<003200200041>'; - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa))->isEqualTo("\x002\x00 \x00A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, false))->isEqualTo("\x002\x00 \x00A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, true))->isEqualTo("(\x002\x00 \x00A)"); - - $hexa = '<00320020> 8 <0041>'; - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa))->isEqualTo("\x002\x00 8 \x00A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, false))->isEqualTo("\x002\x00 8 \x00A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, true))->isEqualTo( - "(\x002\x00 ) 8 (\x00A)" - ); - - $hexa = '<3220> 8 <41>'; - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa))->isEqualTo("2 8 A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, false))->isEqualTo("2 8 A"); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, true))->isEqualTo("(2 ) 8 (A)"); - - $hexa = '<00320020005C>-10<0041>'; - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa))->isEqualTo("\x002\x00 \x00\\-10\x00A"); 
- $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, false))->isEqualTo( - "\x002\x00 \x00\\-10\x00A" - ); - $this->assert->string(\Smalot\PdfParser\Font::decodeHexadecimal($hexa, true))->isEqualTo( - "(\x002\x00 \x00\\\\)-10(\x00A)" - ); - } - - public function testDecodeOctal() - { - $this->assert->string(\Smalot\PdfParser\Font::decodeOctal("\\101\\102\\040\\103"))->isEqualTo('AB C'); - $this->assert->string(\Smalot\PdfParser\Font::decodeOctal("\\101\\102\\040\\103D"))->isEqualTo('AB CD'); - } - - public function testDecodeEntities() - { - $this->assert->string(\Smalot\PdfParser\Font::decodeEntities("File#20Type"))->isEqualTo('File Type'); - $this->assert->string(\Smalot\PdfParser\Font::decodeEntities("File##20Ty#pe"))->isEqualTo('File# Ty#pe'); - } - - public function testDecodeUnicode() - { - $this->assert->string(\Smalot\PdfParser\Font::decodeUnicode("\xFE\xFF\x00A\x00B"))->isEqualTo('AB'); - } - - public function testDecodeText() - { - $filename = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $fonts = $document->getFonts(); - /** @var \Smalot\PdfParser\Font $font */ - // Cambria - $font = reset($fonts); - $commands = array( - array( - 't' => '', - 'c' => "\x01\x02", - ), - array( - 't' => 'n', - 'c' => -10, - ), - array( - 't' => '', - 'c' => "\x03", - ), - array( - 't' => '', - 'c' => "\x04", - ), - array( - 't' => 'n', - 'c' => -100, - ), - array( - 't' => '<', - 'c' => "01020304", - ), - ); - $this->assert->string($font->decodeText($commands))->isEqualTo('Docu Docu'); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Header.php b/src/Smalot/PdfParser/Tests/Units/Header.php deleted file mode 100644 index 7d7e8f4a..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Header.php +++ /dev/null @@ -1,151 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction 
oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units; - -use mageekguy\atoum; - -/** - * Class Header - * - * @package Smalot\PdfParser\Tests\Units - */ -class Header extends atoum\test -{ - public function testParse() - { - $document = new \Smalot\PdfParser\Document(); - - $content = '<>foo'; - $position = 0; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - - $this->assert->object($header)->isInstanceOf('\Smalot\PdfParser\Header'); - $this->assert->integer($position)->isEqualTo(27); - $this->assert->array($header->getElements())->hasSize(2); - - // No header to parse - $this->assert->castToString($header->get('Type'))->isEqualTo('Page'); - $content = 'foo'; - $position = 0; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - - $this->assert->object($header)->isInstanceOf('\Smalot\PdfParser\Header'); - $this->assert->integer($position)->isEqualTo(0); - $this->assert->array($header->getElements())->hasSize(0); - - $position = 0; - $content = "<>"; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - $this->assert->integer($position)->isEqualTo(212); - - $position = 0; - $content = '[5 0 R ] foo'; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - 
$this->assert->integer($position)->isEqualTo(8); - $this->assert->array($header->getElements())->hasSize(1); - -// var_dump($header); -// die(); - } - - public function testGetElements() - { - $document = new \Smalot\PdfParser\Document(); - - $content = '<>foo'; - $position = 0; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - - $this->assert->array($elements = $header->getElements())->hasSize(2); - $this->assert->object(current($elements))->isInstanceOf('\Smalot\PdfParser\Element\ElementName'); - - $types = $header->getElementTypes(); - $this->assert->array($types); - $this->assert->string($types['Type'])->isEqualTo('Smalot\PdfParser\Element\ElementName'); - $this->assert->string($types['Subtype'])->isEqualTo('Smalot\PdfParser\Element\ElementName'); - } - - public function testHas() - { - $document = new \Smalot\PdfParser\Document(); - - $content = '<>foo'; - $position = 0; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - - $this->assert->boolean($header->has('Type'))->isEqualTo(true); - $this->assert->boolean($header->has('SubType'))->isEqualTo(true); - $this->assert->boolean($header->has('Font'))->isEqualTo(true); - $this->assert->boolean($header->has('Text'))->isEqualTo(false); - } - - public function testGet() - { - $document = new \Smalot\PdfParser\Document(); - - $content = '<>foo'; - $position = 0; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - $object = new \Smalot\PdfParser\Page($document, $header); - $document->setObjects(array('5_0' => $object)); - - $this->assert->object($header->get('Type'))->isInstanceOf('\Smalot\PdfParser\Element\ElementName'); - $this->assert->object($header->get('SubType'))->isInstanceOf('\Smalot\PdfParser\Element\ElementName'); - $this->assert->object($header->get('Font'))->isInstanceOf('\Smalot\PdfParser\Page'); - $this->assert->object($header->get('Image'))->isInstanceOf('\Smalot\PdfParser\Element\ElementMissing'); - - try { - $resources 
= $header->get('Resources'); - $this->assert->boolean(true)->isEqualTo(false); - } catch (\Exception $e) { - $this->assert->exception($e)->hasMessage('Missing object reference #8_0.'); - } - } - - public function testResolveXRef() - { - $document = new \Smalot\PdfParser\Document(); - $content = '<>foo'; - $position = 0; - $header = \Smalot\PdfParser\Header::parse($content, $document, $position); - $object = new \Smalot\PdfParser\Page($document, $header); - $document->setObjects(array('5_0' => $object)); - - $this->assert->object($header->get('Font'))->isInstanceOf('\Smalot\PdfParser\Object'); - - try { - $this->assert->object($header->get('Resources'))->isInstanceOf('\Smalot\PdfParser\Element\ElementMissing'); - $this->assert->boolean(true)->isEqualTo(false); - } catch (\Exception $e) { - $this->assert->exception($e)->hasMessage('Missing object reference #8_0.'); - } - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Object.php b/src/Smalot/PdfParser/Tests/Units/Object.php deleted file mode 100644 index e2bca86a..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Object.php +++ /dev/null @@ -1,311 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . 
- * - */ - -namespace Smalot\PdfParser\Tests\Units; - -use mageekguy\atoum; - -/** - * Class Object - * - * @package Smalot\PdfParser\Tests\Units - */ -class Object extends atoum\test -{ - const TYPE = 't'; - - const OPERATOR = 'o'; - - const COMMAND = 'c'; - - public function testGetTextParts() - { - } - -// public function testGetCommandsImage() -// { -// $content = "/CS/RGB -///W 22 -///H 1 -///BPC 8 -///F/Fl -///DP<> -//ID \x00\x50c\x63 -//EI Q -//q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm -//BI -//"; -// -// $document = new \Smalot\PdfParser\Document(); -// $object = new \Smalot\PdfParser\Object($document); -// $offset = 0; -// $parts = $object->getCommandsImage($content, $offset); -// $reference = array( -// array( -// self::TYPE => '/', -// self::OPERATOR => 'CS', -// self::COMMAND => 'RGB', -// ), -// array( -// self::TYPE => '/', -// self::OPERATOR => 'W', -// self::COMMAND => '22', -// ), -// array( -// self::TYPE => '/', -// self::OPERATOR => 'H', -// self::COMMAND => '1', -// ), -// array( -// self::TYPE => '/', -// self::OPERATOR => 'BPC', -// self::COMMAND => '8', -// ), -// array( -// self::TYPE => '/', -// self::OPERATOR => 'F', -// self::COMMAND => 'Fl', -// ), -// array( -// self::TYPE => 'struct', -// self::OPERATOR => 'DP', -// self::COMMAND => array( -// array( -// self::TYPE => '/', -// self::OPERATOR => 'Predictor', -// self::COMMAND => '15', -// ), -// array( -// self::TYPE => '/', -// self::OPERATOR => 'Columns', -// self::COMMAND => '22', -// ), -// array( -// self::TYPE => '/', -// self::OPERATOR => 'Colors', -// self::COMMAND => '3', -// ), -// ), -// ), -// array( -// self::TYPE => '', -// self::OPERATOR => 'ID', -// self::COMMAND => "\x00\x50c\x63", -// ), -// ); -// -// $this->assert->array($parts)->isEqualTo($reference); -// $this->assert->integer($offset)->isEqualTo(83); -// } - - public function testGetCommandsText() - { - $content = "/R14 30 Tf 0.999016 0 0 1 137.4 -342.561 Tm -[(A)-168.854( BC 
D)-220.905(\\(E\\))20.905<20>] -TJ /R14 17.16 Tf <20> Tj -0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj -/R14 20.04 Tf -ET Q -q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm -BI"; - - $document = new \Smalot\PdfParser\Document(); - $object = new \Smalot\PdfParser\Object($document); - $offset = 0; - $parts = $object->getCommandsText($content, $offset); - $reference = array( - array( - self::TYPE => '/', - self::OPERATOR => 'Tf', - self::COMMAND => 'R14 30', - ), - array( - self::TYPE => '', - self::OPERATOR => 'Tm', - self::COMMAND => "0.999016 0 0 1 137.4\n342.561", - ), - array( - self::TYPE => '[', - self::OPERATOR => 'TJ', - self::COMMAND => array( - array( - self::TYPE => '(', - self::OPERATOR => '', - self::COMMAND => 'A', - ), - array( - self::TYPE => 'n', - self::OPERATOR => '', - self::COMMAND => '-168.854', - ), - array( - self::TYPE => '(', - self::OPERATOR => '', - self::COMMAND => ' BC D', - ), - array( - self::TYPE => 'n', - self::OPERATOR => '', - self::COMMAND => '-220.905', - ), - array( - self::TYPE => '(', - self::OPERATOR => '', - self::COMMAND => '\\(E\\)', - ), - array( - self::TYPE => 'n', - self::OPERATOR => '', - self::COMMAND => '20.905', - ), - array( - self::TYPE => '<', - self::OPERATOR => '', - self::COMMAND => '20', - ), - ), - ), - array( - self::TYPE => '/', - self::OPERATOR => 'Tf', - self::COMMAND => 'R14 17.16', - ), - array( - self::TYPE => '<', - self::OPERATOR => 'Tj', - self::COMMAND => '20', - ), - array( - self::TYPE => '', - self::OPERATOR => 'Tm', - self::COMMAND => '0.999014 0 0 1 336.84 319.161', - ), - array( - self::TYPE => '', - self::OPERATOR => 'T*', - self::COMMAND => '', - ), - array( - self::TYPE => '(', - self::OPERATOR => 'Tj', - self::COMMAND => " \x00m", - ), - array( - self::TYPE => '/', - self::OPERATOR => 'Tf', - self::COMMAND => 'R14 20.04', - ), - ); - - $this->assert->array($parts)->isEqualTo($reference); - $this->assert->integer($offset)->isEqualTo(172); - } - - public function 
testCleanContent() - { - $content = '/Shape <>> BT >>BDC -Q -/CS0 cs 1 1 0 scn -1 i -/GS0 gs -BT -/TT0 1 Tf -0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm -(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj -EMC -(ABC) Tj - -[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TD - -ET -/Shape <>BDC -q -0.03 841'; - - $expected = '_____________________________________ -Q -/CS0 cs 1 1 0 scn -1 i -/GS0 gs -BT -/TT0 1 Tf -0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm -(________________________________________________)Tj -___ -(___) Tj - -[_____________________________________] TD - -ET -______________________ -q -0.03 841'; - - $document = new \Smalot\PdfParser\Document(); - $object = new \Smalot\PdfParser\Object($document); - $cleaned = $object->cleanContent($content, '_'); - - $this->assert->string($cleaned)->length->isEqualTo(strlen($content)); - $this->assert->string($cleaned)->isEqualTo($expected); - } - - public function testGetSectionText() - { - $content = '/Shape <>BDC -Q -/CS0 cs 1 1 0 scn -1 i -/GS0 gs -BT -/TT0 1 Tf -0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm -(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj -EMC -(ABC) Tj - -[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD - -ET -/Shape <>BDC BT /TT1 1.5 Tf (BT )Tj ET -q -0.03 841'; - - $document = new \Smalot\PdfParser\Document(); - $object = new \Smalot\PdfParser\Object($document); - $sections = $object->getSectionsText($content); -// var_dump($sections); - -// $this->assert->string($cleaned)->length->isEqualTo(strlen($content)); -// $this->assert->string($cleaned)->isEqualTo($expected); - } -} diff --git a/src/Smalot/PdfParser/Tests/Units/Page.php b/src/Smalot/PdfParser/Tests/Units/Page.php deleted file mode 100644 index 9a90f905..00000000 --- a/src/Smalot/PdfParser/Tests/Units/Page.php +++ /dev/null @@ -1,114 +0,0 @@ - - * @date 2013-08-08 - * @license GPL-3.0 - * @url - * - * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * If not, see . - * - */ - -namespace Smalot\PdfParser\Tests\Units; - -use mageekguy\atoum; - -/** - * Class Page - * - * @package Smalot\PdfParser\Tests\Units - */ -class Page extends atoum\test -{ - public function testGetFonts() - { - // Document with text. - $filename = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $pages = $document->getPages(); - $page = $pages[0]; - - // the first to load data. - $fonts = $page->getFonts(); - $this->assert->array($fonts)->isNotEmpty(); - foreach ($fonts as $font) { - $this->assert->object($font)->isInstanceOf('\Smalot\PdfParser\Font'); - } - // the second to use cache. - $fonts = $page->getFonts(); - $this->assert->array($fonts)->isNotEmpty(); - - // ------------------------------------------------------ - // Document without text. - $filename = __DIR__ . '/../../../../../samples/Document3_pdfcreator_nocompressed.pdf'; - $document = $parser->parseFile($filename); - $pages = $document->getPages(); - $page = $pages[0]; - - // the first to load data. - $fonts = $page->getFonts(); - $this->assert->array($fonts)->isEmpty(); - // the second to use cache. 
- $fonts = $page->getFonts(); - $this->assert->array($fonts)->isEmpty(); - } - - public function testGetFont() - { - // Document with text. - $filename = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $pages = $document->getPages(); - $page = $pages[0]; - - // the first to load data. - $font = $page->getFont('R7'); - $this->assert->object($font)->isInstanceOf('\Smalot\PdfParser\Font'); - $font = $page->getFont('ABC7'); - $this->assert->object($font)->isInstanceOf('\Smalot\PdfParser\Font'); - } - - public function testGetText() - { - // Document with text. - $filename = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf'; - $parser = new \Smalot\PdfParser\Parser(); - $document = $parser->parseFile($filename); - $pages = $document->getPages(); - $page = $pages[0]; - $text = $page->getText(); - -// var_dump($text); - - $this->assert->string($text)->hasLengthGreaterThan(150); - $this->assert->string($text)->contains('Document title'); - $this->assert->string($text)->contains('Lorem ipsum'); - - $this->assert->string($text)->contains('Calibri'); - $this->assert->string($text)->contains('Arial'); - $this->assert->string($text)->contains('Times'); - $this->assert->string($text)->contains('Courier New'); - $this->assert->string($text)->contains('Verdana'); - } -} diff --git a/src/Smalot/PdfParser/XObject/Form.php b/src/Smalot/PdfParser/XObject/Form.php index 0bb888a1..8e60647f 100644 --- a/src/Smalot/PdfParser/XObject/Form.php +++ b/src/Smalot/PdfParser/XObject/Form.php @@ -5,51 +5,46 @@ * This file is part of the PdfParser library. * * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. 
- * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\XObject; use Smalot\PdfParser\Header; -use Smalot\PdfParser\Object; use Smalot\PdfParser\Page; +use Smalot\PdfParser\PDFObject; /** * Class Form - * - * @package Smalot\PdfParser\XObject */ class Form extends Page { - /** - * @param Page - * - * @return string - */ - public function getText(Page $page = null) + public function getText(?Page $page = null): string { - $header = new Header(array(), $this->document); - $contents = new Object($this->document, $header, $this->content); + $header = new Header([], $this->document); + $contents = new PDFObject($this->document, $header, $this->content, $this->config); return $contents->getText($this); } diff --git a/src/Smalot/PdfParser/XObject/Image.php b/src/Smalot/PdfParser/XObject/Image.php index 818833af..6dc6b0a6 100644 --- a/src/Smalot/PdfParser/XObject/Image.php +++ b/src/Smalot/PdfParser/XObject/Image.php @@ -5,47 +5,42 @@ * This file is part of the PdfParser library. 
* * @author Sébastien MALOT - * @date 2013-08-08 - * @license GPL-3.0 + * + * @date 2017-01-03 + * + * @license LGPLv3 + * * @url * * PdfParser is a pdf library written in PHP, extraction oriented. - * Copyright (C) 2014 - Sébastien MALOT + * Copyright (C) 2017 - Sébastien MALOT * * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see . - * */ namespace Smalot\PdfParser\XObject; -use Smalot\PdfParser\Object; use Smalot\PdfParser\Page; +use Smalot\PdfParser\PDFObject; /** * Class Image - * - * @package Smalot\PdfParser\XObject */ -class Image extends Object +class Image extends PDFObject { - /** - * @param Page - * - * @return string - */ - public function getText(Page $page = null) + public function getText(?Page $page = null): string { return ''; } diff --git a/tests/AltAutoloading/AltAutoloadCheck.php b/tests/AltAutoloading/AltAutoloadCheck.php new file mode 100644 index 00000000..f638c74b --- /dev/null +++ b/tests/AltAutoloading/AltAutoloadCheck.php @@ -0,0 +1,43 @@ + + * + * @date 2021-02-09 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. 
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ +require __DIR__.'/../../alt_autoload.php-dist'; + +$parser = new Smalot\PdfParser\Parser(); + +$filename = __DIR__.'/../../samples/InternationalChars.pdf'; +$document = $parser->parseFile($filename); + +$needle = 'Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте.'; +if (false !== strpos($document->getText(), $needle)) { // strpos() returns false when the needle is absent; any offset (including 0) means the text was extracted + return 0; +} + +throw new Exception('Something went wrong. Alt-Autoload is not working.'); diff --git a/tests/PHPUnit/Integration/ConfigTest.php b/tests/PHPUnit/Integration/ConfigTest.php new file mode 100644 index 00000000..7b0ecc8e --- /dev/null +++ b/tests/PHPUnit/Integration/ConfigTest.php @@ -0,0 +1,58 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; + +class ConfigTest extends TestCase +{ + public function testHorizontalOffset(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue494.pdf'; + + $config = new Config(); + $config->setHorizontalOffset(''); + + $parser = $this->getParserInstance($config); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + $reference = '11 ADET DERGİ İÇİN 3 KALEM HİZMET ALIMI İHALE EDİLECEKTİR '; + $firstLine = explode("\n", $text)[0]; + $this->assertSame($reference, $firstLine); + } +} diff --git a/tests/PHPUnit/Integration/DocumentGeneratorFocusTest.php b/tests/PHPUnit/Integration/DocumentGeneratorFocusTest.php new file mode 100644 index 00000000..f49f6f2d --- /dev/null +++ b/tests/PHPUnit/Integration/DocumentGeneratorFocusTest.php @@ -0,0 +1,225 @@ + + * + * @date 2020-06-01 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Parser; + +/** + * Document related tests which focus on certain PDF generators. + */ +class DocumentGeneratorFocusTest extends TestCase +{ + /** + * Test getText result. + * + * PDF generated with Chromium 116 via SaveAs-dialog. + */ + public function testGetTextPull634Chromium(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf'); + + self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $document->getText()); + } + + /** + * Test getText result. + * + * PDF (v 1.4) generated with Inkscape 0.92. + */ + public function testGetTextPull634InkscapePDF14(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf'); + + self::assertEquals('TEST', $document->getText()); + } + + /** + * Test getText result. + * + * PDF (v 1.5) generated with Inkscape 0.92. + */ + public function testGetTextPull634InkscapePDF15(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf'); + + self::assertEquals('TEST', $document->getText()); + } + + /** + * Test getText result. + * + * PDF (1.4) generated with LibreOffice Writer (6.4). 
+ * + * @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html + */ + public function testGetTextPull634LibreOffice(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf'); + + self::assertStringContainsString( + 'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß', + $document->getText() + ); + } + + /** + * Test getText result. + * + * PDF (v 1.7) generated with Microsoft Print-to-PDF via Firefox. + */ + public function testGetTextPull634MicrosoftPDF17(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf'); + + $outputText = $document->getText(); + + self::assertStringContainsString( + 'Adobe PDF icon'."\n".'Filename'."\n".'extension', + $outputText + ); + + self::assertStringContainsString( + 'are necessary to make, use, sell, and distribute PDF-compliant', + $outputText + ); + } + + /** + * Test Document functions. + * + * PDF (v 1.5) generated by Microsoft Word 2016. + */ + public function testGetTextPull634MicrosoftWord2016(): void + { + $path = $this->rootDir.'/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf'; + $document = (new Parser())->parseFile($path); + + $outputText = $document->getText(); + + self::assertStringContainsString('(einschließlich Marktpflegequote) von 4 Mrd € angestrebt.', $outputText); + + // check whitespaces and tab usage + self::assertStringContainsString( + // ,--- here is a tab + 'Fälligkeit: 19. Oktober 2028 '."\n". + 'Zinszahlung: 19. Oktober gzj., Zinslaufbeginn 15. Juni 2023', + $outputText + ); + } + + /** + * Test getText result. + * + * PDF (v 1.5) generated with Power PDF Create. 
+ */ + public function testGetTextPull634PowerPDFCreate(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf'); + + $outputText = $document->getText(); + + // located on page 1 + self::assertStringContainsString( + 'Index-Verhältniszahl: 1,17812 (am Valutierungstag 7. September 2023)', + $outputText + ); + + // located on page 2 + self::assertStringContainsString( + 'Einbeziehung in den '."\n". + 'Börsenhandel: Dienstag, 5. September 2023 '."\n". + 'Valutierungstag: Donnerstag, 7. September 2023', + $outputText + ); + } + + /** + * Test getText result. + * + * PDF generated from .docx with SmallPDF (https://smallpdf.com) + */ + public function testGetTextPull634SmallPDF(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf'); + + $outputText = $document->getText(); + + // Actual encoded spaces in the document are preserved + self::assertStringContainsString( + 'SmallPDF SMALLPDF SmallPDF', + $outputText + ); + + // Hebrew text + self::assertStringContainsString( + 'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online', + $outputText + ); + + // Russian text + self::assertStringContainsString( + 'Russian Keyboard - русская клавиатура - Type Russian', + $outputText + ); + } + + /** + * Test getText result. + * + * PDF (1.6) generated by Word için Acrobat PDFMaker 17. 
+ */ + public function testGetTextPull634WordIcinAcrobatPDFMaker17(): void + { + $path = $this->rootDir.'/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_için_Acrobat_PDFMaker_17.pdf'; + $document = (new Parser())->parseFile($path); + + $outputText = $document->getText(); + + self::assertStringContainsString( + 'İhracat ve döviz kazandırıcı hizmetler reeskont kredisi günlük', + $outputText + ); + + // Unnecessary tabs are not inserted due to font-size being 1, + // but the text-matrix scale is 9 or 10 + self::assertStringContainsString( + 'dikkate alınmasına devam edilecektir.', + $outputText + ); + + // This encoded segment contains an escaped backslash right before + // an octal code: \\\000. Account for this in Font::decodeOctal() + // See: https://github.com/smalot/pdfparser/pull/640 + self::assertStringContainsString('Sayı: 2023-34', $outputText); + } +} diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php new file mode 100644 index 00000000..7c7fe7e6 --- /dev/null +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -0,0 +1,114 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Parser; + +/** + * Document related tests which are related to certain issues. + */ +class DocumentIssueFocusTest extends TestCase +{ + /** + * Tests getText method without a given page limit. + * + * @see https://github.com/smalot/pdfparser/pull/562 + */ + public function testGetTextNoPageLimit(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf'); + + self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText()); + } + + /** + * Tests getText method with a given page limit. + * + * @see https://github.com/smalot/pdfparser/pull/562 + */ + public function testGetTextWithPageLimit(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf'); + + // given text is on page 2, it has to be ignored because of that + self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1)); + } + + /** + * Tests extraction of XMP Metadata vs. getHeader() data. + * + * @see https://github.com/smalot/pdfparser/pull/606 + */ + public function testExtractXMPMetadata(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf'); + + $details = $document->getDetails(); + + // Test that the dc:title data was extracted from the XMP + // Metadata. 
+ self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']); + } + + /** + * Tests PDFDocEncoding decode of Document Properties + * + * @see https://github.com/smalot/pdfparser/issues/609 + */ + public function testPDFDocEncodingDecode(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue609.pdf'); + + $details = $document->getDetails(); + + // These test that Adobe-inserted \r are removed from a UTF-8 + // escaped metadata string, and the surrounding characters are + // repaired + $testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™fiflŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'; + self::assertStringContainsString($testKeywords, $details['Keywords']); + + $testKeywords = 'added line-feeds often destroy multibyte characters'; + self::assertStringContainsString($testKeywords, $details['Keywords']); + + // This tests that the PDFDocEncoding characters that differ + // from CP-1252 are decoded to their correct UTF-8 code points + // as well as removing \r line-feeds + $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; + self::assertStringContainsString($testSubject, $details['Subject']); + } +} diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php new file mode 100644 index 00000000..346ba633 --- /dev/null +++ b/tests/PHPUnit/Integration/DocumentTest.php @@ -0,0 +1,267 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Page; +use Smalot\PdfParser\Pages; +use Smalot\PdfParser\PDFObject; + +/** + * General Document related tests. + */ +class DocumentTest extends TestCase +{ + protected function getDocumentInstance(): Document + { + return new Document(); + } + + protected function getPDFObjectInstance(Document $document, ?Header $header = null): PDFObject + { + return new PDFObject($document, $header); + } + + protected function getPageInstance(Document $document, Header $header): PDFObject + { + return new Page($document, $header); + } + + protected function getPagesInstance(Document $document, Header $header): PDFObject + { + return new Pages($document, $header); + } + + public function testSetObjects(): void + { + $document = $this->getDocumentInstance(); + $object = $this->getPDFObjectInstance($document); + + // Obj #1 is missing + $this->assertNull($document->getObjectById(1)); + $document->setObjects([1 => $object]); + + // Obj #1 exists + $this->assertTrue($document->getObjectById(1) instanceof PDFObject); + + $content = '<>'; + $header = Header::parse($content, $document); + $object = $this->getPDFObjectInstance($document, $header); + $document->setObjects([2 => $object]); + + // Obj #1 is missing + $this->assertNull($document->getObjectById(1)); + + // Obj #2 exists + $this->assertTrue($document->getObjectById(2) instanceof PDFObject); + } + + public function testGetObjects(): void + { + $document = $this->getDocumentInstance(); + 
$object1 = $this->getPDFObjectInstance($document); + $content = '<>unparsed content'; + $header = Header::parse($content, $document); + + $object2 = $this->getPageInstance($document, $header); + $document->setObjects([1 => $object1, 2 => $object2]); + + $objects = $document->getObjects(); + $this->assertEquals(2, \count($objects)); + $this->assertTrue($objects[1] instanceof PDFObject); + $this->assertTrue($objects[2] instanceof PDFObject); + $this->assertTrue($objects[2] instanceof Page); + } + + public function testDictionary(): void + { + $document = $this->getDocumentInstance(); + $objects = $document->getDictionary(); + $this->assertEquals(0, \count($objects)); + $object1 = $this->getPDFObjectInstance($document); + + $content = '<>'; + $header = Header::parse($content, $document); + $object2 = $this->getPageInstance($document, $header); + $document->setObjects([1 => $object1, 2 => $object2]); + + $objects = $document->getDictionary(); + $this->assertEquals(1, \count($objects)); + $this->assertEquals(1, \count($objects['Page']['all'])); + $this->assertEquals($object2, $objects['Page']['all'][2]); + } + + public function testGetObjectsByType(): void + { + $document = $this->getDocumentInstance(); + $object1 = $this->getPDFObjectInstance($document); + $content = '<>'; + $header = Header::parse($content, $document); + $object2 = $this->getPageInstance($document, $header); + $document->setObjects([1 => $object1, 2 => $object2]); + + $objects = $document->getObjectsByType('Page'); + $this->assertEquals(1, \count($objects)); + $this->assertTrue($objects[2] instanceof PDFObject); + $this->assertTrue($objects[2] instanceof Page); + } + + public function testGetPages(): void + { + $document = $this->getDocumentInstance(); + + // Listing pages from type Page + $content = '<>'; + $header = Header::parse($content, $document); + $object1 = $this->getPageInstance($document, $header); + $header = Header::parse($content, $document); + $object2 = 
$this->getPageInstance($document, $header); + $document->setObjects([1 => $object1, 2 => $object2]); + $pages = $document->getPages(); + + $this->assertEquals(2, \count($pages)); + $this->assertTrue($pages[0] instanceof Page); + $this->assertTrue($pages[1] instanceof Page); + + // Listing pages from type Pages (kids) + $content = '<>'; + $header = Header::parse($content, $document); + $object1 = $this->getPageInstance($document, $header); + $header = Header::parse($content, $document); + $object2 = $this->getPageInstance($document, $header); + $header = Header::parse($content, $document); + $object3 = $this->getPageInstance($document, $header); + + $content = '<>'; + $header = Header::parse($content, $document); + $object4 = $this->getPagesInstance($document, $header); + + $content = '<>'; + $header = Header::parse($content, $document); + $object5 = $this->getPagesInstance($document, $header); + + $document->setObjects([ + '1_0' => $object1, + '2_0' => $object2, + '3_0' => $object3, + '4_0' => $object4, + '5_0' => $object5, + ]); + $pages = $document->getPages(); + + $this->assertEquals(3, \count($pages)); + $this->assertTrue($pages[0] instanceof Page); + $this->assertTrue($pages[1] instanceof Page); + $this->assertTrue($pages[2] instanceof Page); + + // Listing pages from type Catalog + $content = '<>'; + $header = Header::parse($content, $document); + $object1 = $this->getPageInstance($document, $header); + $header = Header::parse($content, $document); + $object2 = $this->getPageInstance($document, $header); + $header = Header::parse($content, $document); + $object3 = $this->getPageInstance($document, $header); + $content = '<>'; + $header = Header::parse($content, $document); + $object4 = $this->getPagesInstance($document, $header); + $content = '<>'; + $header = Header::parse($content, $document); + $object5 = $this->getPagesInstance($document, $header); + $content = '<>'; + $header = Header::parse($content, $document); + $object6 = 
$this->getPagesInstance($document, $header); + $document->setObjects( + [ + '1_0' => $object1, + '2_0' => $object2, + '3_0' => $object3, + '4_0' => $object4, + '5_0' => $object5, + '6_0' => $object6, + ] + ); + $pages = $document->getPages(); + $this->assertEquals(3, \count($pages)); + $this->assertTrue($pages[0] instanceof Page); + $this->assertTrue($pages[1] instanceof Page); + $this->assertTrue($pages[2] instanceof Page); + } + + public function testGetPagesMissingCatalog(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Missing catalog.'); + + // Missing catalog + $document = $this->getDocumentInstance(); + $document->getPages(); + } + + /** + * @see https://github.com/smalot/pdfparser/issues/721 + */ + public function testExtractXMPMetadataIssue721(): void + { + $document = $this->getDocumentInstance(); + + // Check that XMP metadata is parsed even if missing a dc:format tag + $content = ' + + + + + + PdfParser + + + 2018-02-07T11:51:44-05:00 + 2019-10-23T09:56:01-04:00 + + +'; + + $document->extractXMPMetadata($content); + $document->init(); + $details = $document->getDetails(); + + $this->assertEquals(4, \count($details)); + $this->assertEquals('PdfParser', $details['dc:creator']); + $this->assertEquals('2019-10-23T09:56:01-04:00', $details['xmp:modifydate']); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementArrayTest.php b/tests/PHPUnit/Integration/Element/ElementArrayTest.php new file mode 100644 index 00000000..0b095a41 --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementArrayTest.php @@ -0,0 +1,187 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. 
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element\ElementArray; +use Smalot\PdfParser\Element\ElementNumeric; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Page; + +class ElementArrayTest extends TestCase +{ + public function testParse(): void + { + $document = $this->getDocumentInstance(); + + // Skipped. + $offset = 0; + $element = ElementArray::parse('ABC', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementArray::parse(' / [ 4 2 ] ', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementArray::parse(' 0 [ 4 2 ] ', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementArray::parse(" 0 \n [ 4 2 ] ", $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. 
+ $offset = 0; + $element = ElementArray::parse(' [ 4 2 ] ', $document, $offset); + $this->assertTrue($element->contains(2)); + $this->assertTrue($element->contains(4)); + $this->assertFalse($element->contains(8)); + $this->assertEquals(8, $offset); + + $offset = 0; + $element = ElementArray::parse(' [ 4 2 ]', $document, $offset); + $this->assertTrue($element->contains(2)); + $this->assertTrue($element->contains(4)); + $this->assertFalse($element->contains(8)); + $this->assertEquals(8, $offset); + + $offset = 0; + $element = ElementArray::parse('[ 4 2 ]', $document, $offset); + $this->assertTrue($element->contains(2)); + $this->assertTrue($element->contains(4)); + $this->assertFalse($element->contains(8)); + $this->assertEquals(7, $offset); + + $offset = 0; + $element = ElementArray::parse(" \n [ 4 2 ] ", $document, $offset); + $this->assertTrue($element->contains(2)); + $this->assertTrue($element->contains(4)); + $this->assertFalse($element->contains(8)); + $this->assertEquals(10, $offset); + } + + public function testGetContent(): void + { + $val_4 = new ElementNumeric('4'); + $val_2 = new ElementNumeric('2'); + $element = new ElementArray([$val_4, $val_2]); + + $content = $element->getContent(); + $this->assertCount(2, $content); + } + + public function testContains(): void + { + $val_4 = new ElementNumeric('4'); + $val_2 = new ElementNumeric('2'); + $element = new ElementArray([$val_4, $val_2]); + + $this->assertTrue($element->contains(2)); + $this->assertTrue($element->contains(4)); + + $this->assertFalse($element->contains(8)); + } + + public function testResolveXRef(): void + { + // Document with text. 
+ $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $object = $document->getObjectById('3_0'); + $kids = $object->get('Kids'); + + $this->assertTrue($kids instanceof ElementArray); + $this->assertCount(1, $kids->getContent()); + + $pages = $kids->getContent(); + $this->assertTrue(reset($pages) instanceof Page); + } + + public function testGetDetails(): void + { + $document = $this->getDocumentInstance(); + $content = '<> [8 [9 <>]]]>>'; + $details_reference = [ + 'Type' => 'Page', + 'Types' => [ + 8, + ], + 'Sizes' => [ + 1, + 2, + 3, + 4, + 5, + [ + 'Subtype' => 'XObject', + ], + [ + 8, + [ + 9, + [ + 'FontSize' => 10, + ], + ], + ], + ], + ]; + $header = Header::parse($content, $document); + $details = $header->getDetails(); + + $this->assertCount(3, $details); + $this->assertEquals($details_reference, $details); + } + + public function testToString(): void + { + $val_4 = new ElementNumeric('4'); + $val_2 = new ElementNumeric('2'); + $element = new ElementArray([$val_4, $val_2]); + $this->assertEquals('4,2', (string) $element); + + $document = $this->getDocumentInstance(); + $element = ElementArray::parse(' [ 4 2 ]', $document); + $this->assertEquals('4,2', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementBooleanTest.php b/tests/PHPUnit/Integration/Element/ElementBooleanTest.php new file mode 100644 index 00000000..84d83cd8 --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementBooleanTest.php @@ -0,0 +1,148 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. 
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementBoolean; + +class ElementBooleanTest extends TestCase +{ + public function testParse(): void + { + // Skipped. + $offset = 0; + $element = ElementBoolean::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementBoolean::parse(' [ false ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementBoolean::parse(' << true >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementBoolean::parse(' / false ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementBoolean::parse(' 0 true ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementBoolean::parse(" 0 \n true ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. 
+ $offset = 0; + $element = ElementBoolean::parse(' true ', null, $offset); + $this->assertTrue($element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementBoolean::parse(' TRUE ', null, $offset); + $this->assertTrue($element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementBoolean::parse(' True', null, $offset); + $this->assertTrue($element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementBoolean::parse('true', null, $offset); + $this->assertTrue($element->getContent()); + $this->assertEquals(4, $offset); + + $offset = 0; + $element = ElementBoolean::parse('False', null, $offset); + $this->assertFalse($element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementBoolean::parse(" \n true ", null, $offset); + $this->assertTrue($element->getContent()); + $this->assertEquals(7, $offset); + } + + public function testGetContent(): void + { + $element = new ElementBoolean('true'); + $this->assertTrue($element->getContent()); + + $element = new ElementBoolean('false'); + $this->assertFalse($element->getContent()); + } + + public function testEquals(): void + { + $element = new ElementBoolean('true'); + $this->assertTrue($element->equals(true)); + $this->assertFalse($element->equals(1)); + $this->assertFalse($element->equals(false)); + $this->assertFalse($element->equals(null)); + + $element = new ElementBoolean('false'); + $this->assertTrue($element->equals(false)); + $this->assertFalse($element->equals(0)); + $this->assertFalse($element->equals(true)); + $this->assertFalse($element->equals(null)); + } + + public function testContains(): void + { + $element = new ElementBoolean('true'); + $this->assertTrue($element->contains(true)); + $this->assertFalse($element->contains(false)); + $this->assertFalse($element->contains(1)); + } + + public function testToString(): void + { + $element = new ElementBoolean('true'); + 
$this->assertEquals('true', (string) $element); + + $element = new ElementBoolean('false'); + $this->assertEquals('false', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementDateTest.php b/tests/PHPUnit/Integration/Element/ElementDateTest.php new file mode 100644 index 00000000..3f0d5e4e --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementDateTest.php @@ -0,0 +1,183 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementDate; + +class ElementDateTest extends TestCase +{ + public function testParse(): void + { + // Skipped. 
+ $offset = 0; + $element = ElementDate::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementDate::parse(' [ (ABC) 5 6 ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementDate::parse(' << (invalid) >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementDate::parse(' / (FlateDecode) ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementDate::parse(' 0 (FlateDecode) ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementDate::parse(" 0 \n (FlateDecode) ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. + $offset = 0; + $element = ElementDate::parse(' (D:20130901235555+02\'00\') ', null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2013-09-01T23:55:55+02:00', (string) $element); + $this->assertEquals(26, $offset); + + $offset = 0; + $element = ElementDate::parse(' (D:20130901235555+02\'00\') ', null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2013-09-01T23:55:55+02:00', (string) $element); + $this->assertEquals(26, $offset); + + $offset = 0; + $element = ElementDate::parse(' (D:20130901235555+02\'00\')', null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2013-09-01T23:55:55+02:00', (string) $element); + $this->assertEquals(26, $offset); + + $offset = 0; + $element = ElementDate::parse('(D:20130901235555+02\'00\')', null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); 
+ $this->assertEquals('2013-09-01T23:55:55+02:00', (string) $element); + $this->assertEquals(25, $offset); + + $offset = 0; + $element = ElementDate::parse(" \n (D:20130901235555+02'00') ", null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2013-09-01T23:55:55+02:00', (string) $element); + $this->assertEquals(28, $offset); + + $offset = 0; + $element = ElementDate::parse(" \n (D:20130901235555) ", null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2013-09-01T23:55:55+00:00', (string) $element); + $this->assertEquals(21, $offset); + + $offset = 0; + $element = ElementDate::parse("(D:20131206091846Z00'00')", null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2013-12-06T09:18:46+00:00', (string) $element); + $this->assertEquals(25, $offset); + + $offset = 0; + $element = ElementDate::parse(" \n (D:1-23-2014, 19:02:15-03'00') ", null, $offset); + $element->setFormat('c'); + $this->assertTrue($element->getContent() instanceof \DateTime); + $this->assertEquals('2014-01-23T19:02:15-03:00', (string) $element); + $this->assertEquals(33, $offset); + + // Format invalid + $offset = 0; + $element = ElementDate::parse(" \n (D:2013+02'00') ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + } + + public function testGetContent(): void + { + $element = new ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); + $this->assertEquals(new \DateTime('2013-09-01 21:55:55+00:00'), $element->getContent()); + } + + public function testGetContentInvalidParameter(): void + { + $this->expectException(\Exception::class); + + $element = new ElementDate('2013-09-01 23:55:55+02:00'); + $this->assertEquals(new \DateTime('2013-09-01 21:55:55+02:00'), $element->getContent()); + } + + public function testEquals(): void 
+ { + $element = new ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); + $element->setFormat('c'); + + $this->assertTrue($element->equals('2013-09-01T23:55:55+02:00')); + $this->assertFalse($element->equals('2013-09-01T23:55:55+01:00')); + + $this->assertTrue($element->equals(new \DateTime('2013-09-01T21:55:55+00:00'))); + $this->assertFalse($element->equals(new \DateTime('2013-09-01T23:55:55+01:00'))); + + $this->assertFalse($element->equals('ABC')); + } + + public function testContains(): void + { + $element = new ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); + + $this->assertTrue($element->contains('2013-09-01T21:55:55+00:00')); + $this->assertFalse($element->contains('2013-06-15')); + } + + public function testToString(): void + { + $element = new ElementDate(new \DateTime('2013-09-01 23:55:55+02:00')); + + $element->setFormat('c'); + $this->assertEquals('2013-09-01T23:55:55+02:00', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementHexaTest.php b/tests/PHPUnit/Integration/Element/ElementHexaTest.php new file mode 100644 index 00000000..1c22a5c3 --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementHexaTest.php @@ -0,0 +1,134 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementDate; +use Smalot\PdfParser\Element\ElementHexa; +use Smalot\PdfParser\Element\ElementString; + +class ElementHexaTest extends TestCase +{ + public function testParse(): void + { + // Skipped. + $offset = 0; + $element = ElementHexa::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementHexa::parse(' [ <0020> 5 6 ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementHexa::parse(' << <0020> >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementHexa::parse(' / <0020> ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementHexa::parse(' 0 <0020> ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementHexa::parse(" 0 \n <0020> ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. 
+ $offset = 0; + $element = ElementHexa::parse(' <0020> ', null, $offset); + $this->assertEquals(' ', $element->getContent()); + $this->assertEquals(7, $offset); + + $offset = 0; + $element = ElementHexa::parse(' <0020> ', null, $offset); + $this->assertEquals(' ', $element->getContent()); + $this->assertEquals(7, $offset); + + $offset = 0; + $element = ElementHexa::parse(' <0020>', null, $offset); + $this->assertEquals(' ', $element->getContent()); + $this->assertEquals(7, $offset); + + $offset = 0; + $element = ElementHexa::parse('<0020>', null, $offset); + $this->assertEquals(' ', $element->getContent()); + $this->assertEquals(6, $offset); + + $offset = 0; + $element = ElementHexa::parse(" \n <0020> ", null, $offset); + $this->assertEquals(' ', $element->getContent()); + $this->assertEquals(9, $offset); + + $offset = 0; + $element = ElementHexa::parse(" \n <5465616d204d616e6167656d656e742053797374656d73> ", null, $offset); + $this->assertEquals('Team Management Systems', $element->getContent()); + $this->assertEquals(51, $offset); + + $offset = 0; + $element = ElementHexa::parse(" \n <5265706f72744275696c646572> ", null, $offset); + $this->assertTrue($element instanceof ElementString); + $this->assertEquals('ReportBuilder', $element->getContent()); + $this->assertEquals(31, $offset); + + $offset = 0; + $element = ElementHexa::parse(" \n <443a3230313331323137313334303435303027303027> ", null, $offset); + $this->assertTrue($element instanceof ElementDate); + $this->assertEquals('2013-12-17T13:40:45+00:00', (string) $element); + $this->assertEquals(49, $offset); + + // Test that a hexadecimal string 'dirty' with extra characters + // such as newlines or spaces is properly decoded + $element = ElementHexa::decode(' '); + + $this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element); + } +} diff --git 
a/tests/PHPUnit/Integration/Element/ElementMissingTest.php b/tests/PHPUnit/Integration/Element/ElementMissingTest.php new file mode 100644 index 00000000..0930cf5a --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementMissingTest.php @@ -0,0 +1,72 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . 
+ */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementMissing; + +class ElementMissingTest extends TestCase +{ + public function testEquals(): void + { + $element = new ElementMissing(); + $this->assertFalse($element->equals(null)); + $this->assertFalse($element->equals(true)); + $this->assertFalse($element->equals('A')); + $this->assertFalse($element->equals(false)); + } + + public function testGetContent(): void + { + $element = new ElementMissing(); + $this->assertFalse($element->getContent()); + } + + public function testContains(): void + { + $element = new ElementMissing(); + $this->assertFalse($element->contains(null)); + $this->assertFalse($element->contains(true)); + $this->assertFalse($element->contains('A')); + $this->assertFalse($element->contains(false)); + } + + public function testToString(): void + { + $element = new ElementMissing(); + $this->assertEquals('', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementNameTest.php b/tests/PHPUnit/Integration/Element/ElementNameTest.php new file mode 100644 index 00000000..0f390836 --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementNameTest.php @@ -0,0 +1,164 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementName; + +class ElementNameTest extends TestCase +{ + public function testParse(): void + { + // Skipped. + $offset = 0; + $element = ElementName::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + $offset = 0; + $element = ElementName::parse(' [ /ABC 5 6 ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + $offset = 0; + $element = ElementName::parse(' << invalid >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + $offset = 0; + $element = ElementName::parse(' / FlateDecode ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + $offset = 0; + $element = ElementName::parse(' 0 /FlateDecode ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + $offset = 0; + $element = ElementName::parse(" 0 \n /FlateDecode ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. 
+ $offset = 0; + $element = ElementName::parse(' /FlateDecode ', null, $offset); + $this->assertEquals('FlateDecode', $element->getContent()); + $this->assertEquals(13, $offset); + + $offset = 0; + $element = ElementName::parse(' /FlateDecode', null, $offset); + $this->assertEquals('FlateDecode', $element->getContent()); + $this->assertEquals(13, $offset); + + $offset = 0; + $element = ElementName::parse('/FlateDecode', null, $offset); + $this->assertEquals('FlateDecode', $element->getContent()); + $this->assertEquals(12, $offset); + + $offset = 0; + $element = ElementName::parse(" \n /FlateDecode ", null, $offset); + $this->assertEquals('FlateDecode', $element->getContent()); + $this->assertEquals(15, $offset); + + $offset = 0; + $element = ElementName::parse('/FlateDecode2', null, $offset); + $this->assertEquals('FlateDecode2', $element->getContent()); + $this->assertEquals(13, $offset); + + $offset = 0; + $element = ElementName::parse('/Flate-Decode2', null, $offset); + $this->assertEquals('Flate-Decode2', $element->getContent()); + $this->assertEquals(14, $offset); + + $offset = 0; + $element = ElementName::parse('/OJHCYD+Cambria', null, $offset); + $this->assertEquals('OJHCYD+Cambria', $element->getContent()); + $this->assertEquals(15, $offset); + + $offset = 0; + $element = ElementName::parse('/OJHCYD+Cambria,Bold', null, $offset); + $this->assertEquals('OJHCYD+Cambria,Bold', $element->getContent()); + $this->assertEquals(20, $offset); + + $offset = 0; + $element = ElementName::parse('/Flate_Decode2', null, $offset); + $this->assertEquals('Flate', $element->getContent()); + $this->assertEquals(6, $offset); + + $offset = 0; + $element = ElementName::parse('/Flate.Decode2', null, $offset); + $this->assertEquals('Flate.Decode2', $element->getContent()); + $this->assertEquals(14, $offset); + } + + public function testGetContent(): void + { + $element = new ElementName('FlateDecode'); + $this->assertEquals('FlateDecode', $element->getContent()); + } + + public 
function testEquals(): void + { + $element = new ElementName('FlateDecode'); + $this->assertTrue($element->equals('FlateDecode')); + $this->assertFalse($element->equals('Flatedecode')); + + $element = new ElementName('FlateDecode2'); + $this->assertTrue($element->equals('FlateDecode2')); + $this->assertFalse($element->equals('FlateDecode3')); + + $element = new ElementName('Flate-Decode2'); + $this->assertTrue($element->equals('Flate-Decode2')); + $this->assertFalse($element->equals('Flate-Decode3')); + } + + public function testContains(): void + { + $element = new ElementName('FlateDecode'); + $this->assertTrue($element->contains('FlateDecode')); + $this->assertFalse($element->contains('Flatedecode')); + + $element = new ElementName('FlateDecode2'); + $this->assertTrue($element->contains('FlateDecode2')); + $this->assertFalse($element->contains('FlateDecode3')); + + $element = new ElementName('Flate-Decode2'); + $this->assertTrue($element->contains('Flate-Decode2')); + $this->assertFalse($element->contains('Flate-Decode3')); + } + + public function testToString(): void + { + $element = new ElementName('FlateDecode'); + $this->assertEquals('FlateDecode', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementNullTest.php b/tests/PHPUnit/Integration/Element/ElementNullTest.php new file mode 100644 index 00000000..4a70a81c --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementNullTest.php @@ -0,0 +1,131 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementNull; + +class ElementNullTest extends TestCase +{ + public function testParse(): void + { + // Skipped. + $offset = 0; + $element = ElementNull::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNull::parse(' [ null ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNull::parse(' << null >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNull::parse(' / null ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNull::parse(' 0 null ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNull::parse(" 0 \n null ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. 
+ $offset = 0; + $element = ElementNull::parse(' null ', null, $offset); + $this->assertTrue(null === $element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementNull::parse(' null ', null, $offset); + $this->assertTrue(null === $element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementNull::parse(' null', null, $offset); + $this->assertTrue(null === $element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementNull::parse('null', null, $offset); + $this->assertTrue(null === $element->getContent()); + $this->assertEquals(4, $offset); + + $offset = 0; + $element = ElementNull::parse(" \n null ", null, $offset); + $this->assertTrue(null === $element->getContent()); + $this->assertEquals(7, $offset); + } + + public function testGetContent(): void + { + $element = new ElementNull(); + $this->assertTrue(null === $element->getContent()); + } + + public function testEquals(): void + { + $element = new ElementNull(); + $this->assertTrue($element->equals(null)); + $this->assertFalse($element->equals(false)); + $this->assertFalse($element->equals(0)); + $this->assertFalse($element->equals(1)); + } + + public function testContains(): void + { + $element = new ElementNull(); + $this->assertTrue($element->contains(null)); + $this->assertFalse($element->contains(false)); + $this->assertFalse($element->contains(0)); + } + + public function testToString(): void + { + $element = new ElementNull(); + $this->assertEquals('null', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementNumericTest.php b/tests/PHPUnit/Integration/Element/ElementNumericTest.php new file mode 100644 index 00000000..c9b4e7ab --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementNumericTest.php @@ -0,0 +1,199 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library 
written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementNumeric; + +class ElementNumericTest extends TestCase +{ + public function testParse(): void + { + // Skipped. + $offset = 0; + $element = ElementNumeric::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNumeric::parse(' [ 2 ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNumeric::parse(' /2', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementNumeric::parse(" /2 \n 2", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. 
+ $offset = 0; + $element = ElementNumeric::parse(' -2', null, $offset); + $this->assertEquals(-2.0, $element->getContent()); + $this->assertEquals(3, $offset); + + $offset = 0; + $element = ElementNumeric::parse('2BC', null, $offset); + $this->assertEquals(2.0, $element->getContent()); + $this->assertEquals(1, $offset); + + $offset = 0; + $element = ElementNumeric::parse(' 2BC', null, $offset); + $this->assertEquals(2.0, $element->getContent()); + $this->assertEquals(2, $offset); + + $offset = 0; + $element = ElementNumeric::parse(' -2BC', null, $offset); + $this->assertEquals(-2.0, $element->getContent()); + $this->assertEquals(3, $offset); + + $offset = 0; + $element = ElementNumeric::parse(' -2', null, $offset); + $this->assertEquals(-2.0, $element->getContent()); + $this->assertEquals(3, $offset); + + $offset = 0; + $element = ElementNumeric::parse(' 2 0 obj', null, $offset); + $this->assertEquals(2.0, $element->getContent()); + $this->assertEquals(2, $offset); + + $offset = 0; + $element = ElementNumeric::parse(" \n -2 ", null, $offset); + $this->assertEquals(-2.0, $element->getContent()); + $this->assertEquals(5, $offset); + } + + public function testGetContent(): void + { + $element = new ElementNumeric('B'); + $this->assertEquals(0.0, $element->getContent()); + + $element = new ElementNumeric('-2.5'); + $this->assertEquals(-2.5, $element->getContent()); + + $element = new ElementNumeric('-2'); + $this->assertEquals(-2.0, $element->getContent()); + + $element = new ElementNumeric(' -2'); + $this->assertEquals(-2.0, $element->getContent()); + + $element = new ElementNumeric('2.5'); + $this->assertEquals(2.5, $element->getContent()); + + $element = new ElementNumeric('2'); + $this->assertEquals(2.0, $element->getContent()); + } + + public function testEquals(): void + { + $element = new ElementNumeric('1'); + $this->assertFalse($element->equals('B')); + $element = new ElementNumeric('1.5'); + $this->assertFalse($element->equals('B')); + + $element = new 
ElementNumeric('2'); + $this->assertTrue($element->equals('2')); + $element = new ElementNumeric('2'); + $this->assertFalse($element->equals('3')); + + $element = new ElementNumeric('-2'); + $this->assertTrue($element->equals('-2')); + $element = new ElementNumeric('-2'); + $this->assertFalse($element->equals('-3')); + + $element = new ElementNumeric('2.5'); + $this->assertTrue($element->equals('2.5')); + $element = new ElementNumeric('2.5'); + $this->assertFalse($element->equals('3.5')); + + $element = new ElementNumeric('-2.5'); + $this->assertTrue($element->equals('-2.5')); + $element = new ElementNumeric('-2.5'); + $this->assertFalse($element->equals('-3.5')); + } + + public function testContains(): void + { + $element = new ElementNumeric('1'); + $this->assertFalse($element->contains('B')); + $element = new ElementNumeric('1.5'); + $this->assertFalse($element->contains('B')); + + $element = new ElementNumeric('2'); + $this->assertTrue($element->contains('2')); + $element = new ElementNumeric('2'); + $this->assertFalse($element->contains('3')); + + $element = new ElementNumeric('-2'); + $this->assertTrue($element->contains('-2')); + $element = new ElementNumeric('-2'); + $this->assertFalse($element->contains('-3')); + + $element = new ElementNumeric('2.5'); + $this->assertTrue($element->contains('2.5')); + $element = new ElementNumeric('2.5'); + $this->assertFalse($element->contains('3.5')); + + $element = new ElementNumeric('-2.5'); + $this->assertTrue($element->contains('-2.5')); + $element = new ElementNumeric('-2.5'); + $this->assertFalse($element->contains('-3.5')); + } + + public function testToString(): void + { + $element = new ElementNumeric('B'); + $this->assertEquals('0', (string) $element); + $element = new ElementNumeric('1B'); + $this->assertEquals('1', (string) $element); + + $element = new ElementNumeric('2'); + $this->assertEquals('2', (string) $element); + + $element = new ElementNumeric('-2'); + $this->assertEquals('-2', (string) $element); + 
+ $element = new ElementNumeric('2.5'); + $this->assertEquals('2.5', (string) $element); + + $element = new ElementNumeric('-2.5'); + $this->assertEquals('-2.5', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementStringTest.php b/tests/PHPUnit/Integration/Element/ElementStringTest.php new file mode 100644 index 00000000..61d3ad6b --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementStringTest.php @@ -0,0 +1,173 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementString; + +class ElementStringTest extends TestCase +{ + public function testParse(): void + { + // Skipped. 
+ $offset = 0; + $element = ElementString::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementString::parse(' [ (ABC) 5 6 ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementString::parse(' << (invalid) >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementString::parse(' / (FlateDecode) ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementString::parse(' 0 (FlateDecode) ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementString::parse(" 0 \n (FlateDecode) ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid. + $offset = 0; + $element = ElementString::parse(' (Copyright) ', null, $offset); + $this->assertEquals('Copyright', $element->getContent()); + $this->assertEquals(12, $offset); + + $offset = 0; + $element = ElementString::parse(' (Copyright) ', null, $offset); + $this->assertEquals('Copyright', $element->getContent()); + $this->assertEquals(12, $offset); + + $offset = 0; + $element = ElementString::parse(' (Copyright)', null, $offset); + $this->assertEquals('Copyright', $element->getContent()); + $this->assertEquals(12, $offset); + + $offset = 0; + $element = ElementString::parse('(Copyright)', null, $offset); + $this->assertEquals('Copyright', $element->getContent()); + $this->assertEquals(11, $offset); + + $offset = 0; + $element = ElementString::parse('(Copy-right2)', null, $offset); + $this->assertEquals('Copy-right2', $element->getContent()); + $this->assertEquals(13, $offset); + + $offset = 0; + $element = ElementString::parse(" \n (Copyright) ", null, $offset); + $this->assertEquals('Copyright', $element->getContent()); + $this->assertEquals(14, 
$offset); + + $offset = 0; + $element = ElementString::parse('()', null, $offset); + $this->assertEquals('', $element->getContent()); + $this->assertEquals(2, $offset); + + /* + * Complex study case : Unicode + octal. + */ + $offset = 0; + $element = ElementString::parse('(ABC\\))', null, $offset); + $this->assertEquals('ABC)', $element->getContent()); + $this->assertEquals(7, $offset); + + $offset = 0; + $element = ElementString::parse("(\xFE\xFF\\000M)", null, $offset); + $this->assertEquals('M', $element->getContent()); + $this->assertEquals(9, $offset); + + $offset = 0; + $element = ElementString::parse('(<20>)', null, $offset); + $this->assertEquals(' ', $element->getContent()); + $this->assertEquals(6, $offset); + + $offset = 0; + $element = ElementString::parse('(Gutter\\ console\\ assembly)', null, $offset); + $this->assertEquals('Gutter console assembly', $element->getContent()); + $this->assertEquals(27, $offset); + } + + public function testGetContent(): void + { + $element = new ElementString('Copyright'); + $this->assertEquals('Copyright', $element->getContent()); + } + + public function testEquals(): void + { + $element = new ElementString('CopyRight'); + $this->assertTrue($element->equals('CopyRight')); + $this->assertFalse($element->equals('Flatedecode')); + + $element = new ElementString('CopyRight2'); + $this->assertTrue($element->equals('CopyRight2')); + $this->assertFalse($element->equals('CopyRight3')); + + $element = new ElementString('Flate-Decode2'); + $this->assertTrue($element->equals('Flate-Decode2')); + $this->assertFalse($element->equals('Flate-Decode3')); + } + + public function testContains(): void + { + $element = new ElementString('CopyRight'); + $this->assertTrue($element->contains('CopyRight')); + $this->assertFalse($element->contains('Copyright')); + + $element = new ElementString('CopyRight2'); + $this->assertTrue($element->contains('CopyRight2')); + $this->assertFalse($element->contains('CopyRight3')); + } + + public function 
testToString(): void + { + $element = new ElementString('CopyRight'); + $this->assertEquals('CopyRight', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementStructTest.php b/tests/PHPUnit/Integration/Element/ElementStructTest.php new file mode 100644 index 00000000..aa5c994b --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementStructTest.php @@ -0,0 +1,95 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementStruct; +use Smalot\PdfParser\Header; + +class ElementStructTest extends TestCase +{ + public function testParse(): void + { + $document = $this->getDocumentInstance(); + + // Skipped: none of these inputs starts with a struct, so parse() must return false and leave $offset at 0. 
+ $offset = 0; + $element = ElementStruct::parse('ABC', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementStruct::parse(' [ << /Filter /FlateDecode >> ]', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementStruct::parse(' / << /Filter /FlateDecode >> ', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementStruct::parse(' 0 << /Filter /FlateDecode >> ', $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementStruct::parse(" 0 \n << /Filter /FlateDecode >> ", $document, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid: a struct preceded only by optional whitespace is parsed into a Header and $offset is advanced. + $offset = 0; + $element = ElementStruct::parse(' << /Filter /FlateDecode >> ', $document, $offset); + $this->assertTrue($element instanceof Header); + $this->assertEquals(27, $offset); + + $offset = 0; + $element = ElementStruct::parse(' << /Filter /FlateDecode >>', $document, $offset); + $this->assertTrue($element instanceof Header); + $this->assertEquals(27, $offset); + + $offset = 0; + $element = ElementStruct::parse('<< /Filter /FlateDecode >>', $document, $offset); + $this->assertTrue($element instanceof Header); + $this->assertEquals(26, $offset); + + $offset = 0; + $element = ElementStruct::parse(" \n << /Filter /FlateDecode >> ", $document, $offset); + $this->assertTrue($element instanceof Header); + $this->assertEquals(29, $offset); + } +} diff --git a/tests/PHPUnit/Integration/Element/ElementXRefTest.php b/tests/PHPUnit/Integration/Element/ElementXRefTest.php new file mode 100644 index 00000000..b420ad9d --- /dev/null +++ b/tests/PHPUnit/Integration/Element/ElementXRefTest.php @@ -0,0 +1,136 @@ + + * + * @date 2020-06-02 + * + * @author 
Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + 
* + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\Element; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element\ElementXRef; + +class ElementXRefTest extends TestCase +{ + public function testParse(): void + { + // Skipped: none of these inputs starts with a "5 0 R"-style reference, so parse() must return false and leave $offset at 0. + $offset = 0; + $element = ElementXRef::parse('ABC', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementXRef::parse(' [ 5 0 R ]', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementXRef::parse(' << 5 0 R >>', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementXRef::parse(' / 5 0 R ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementXRef::parse(' 0 5 0 R ', null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + $offset = 0; + $element = ElementXRef::parse(" 0 \n 5 0 R ", null, $offset); + $this->assertFalse($element); + $this->assertEquals(0, $offset); + + // Valid: a reference 
preceded only by optional whitespace yields content "5_0" and $offset stops right after the "R". 
+ $offset = 0; + $element = ElementXRef::parse(' 5 0 R ', null, $offset); + $this->assertEquals('5_0', $element->getContent()); + $this->assertEquals(6, $offset); + + $offset = 0; + $element = ElementXRef::parse(' 5 0 R ', null, $offset); + $this->assertEquals('5_0', $element->getContent()); + $this->assertEquals(6, $offset); + + $offset = 0; + $element = ElementXRef::parse(' 5 0 R', null, $offset); + $this->assertEquals('5_0', $element->getContent()); + $this->assertEquals(6, $offset); + + $offset = 0; + $element = ElementXRef::parse('5 0 R', null, $offset); + $this->assertEquals('5_0', $element->getContent()); + $this->assertEquals(5, $offset); + + $offset = 0; + $element = ElementXRef::parse(" \n 5 0 R ", null, $offset); + $this->assertEquals('5_0', $element->getContent()); + $this->assertEquals(8, $offset); + } + + public function testGetContent(): void + { + $element = new ElementXRef('5_0'); + $this->assertEquals('5_0', $element->getContent()); + } + + public function testGetId(): void + { + $element = new ElementXRef('5_0'); + $this->assertEquals('5_0', $element->getId()); + } + + public function testEquals(): void + { + $element = new ElementXRef('5_0'); + $this->assertTrue($element->equals(5)); + $this->assertFalse($element->equals(8)); + $this->assertTrue($element->equals($element)); + } + + public function testContains(): void + { + $element = new ElementXRef('5_0'); + $this->assertTrue($element->contains(5)); + $this->assertFalse($element->contains(8)); + $this->assertTrue($element->contains($element)); + } + + public function testToString(): void + { + $element = new ElementXRef('5_0'); + $this->assertEquals('#Obj#5_0', (string) $element); + } +} diff --git a/tests/PHPUnit/Integration/ElementTest.php b/tests/PHPUnit/Integration/ElementTest.php new file mode 100644 index 00000000..f3f3ceed --- /dev/null +++ b/tests/PHPUnit/Integration/ElementTest.php @@ -0,0 +1,160 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + 
* + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element; +use Smalot\PdfParser\Element\ElementArray; +use Smalot\PdfParser\Element\ElementBoolean; +use Smalot\PdfParser\Element\ElementDate; +use Smalot\PdfParser\Element\ElementName; +use Smalot\PdfParser\Element\ElementNull; +use Smalot\PdfParser\Element\ElementNumeric; +use Smalot\PdfParser\Element\ElementString; +use Smalot\PdfParser\Element\ElementXRef; +use Smalot\PdfParser\Header; + +class ElementTest extends TestCase +{ + public function testParse(): void + { + $document = $this->getDocumentInstance(); + + // Only_values = false: the parsed elements are returned keyed by their name. 
+ $content = '/NameType /FlateDecode + /Contents[4 0 R 42]/Fonts<>/NullType + null/StringType(hello)/DateType(D:20130901235555+02\'00\')/XRefType 2 0 R + /NumericType 8/HexaType<0020>/BooleanType false + /Space#20Test(Templates)/Hyphen#2DTest(Templates)'; + $offset = 0; + + $elements = Element::parse($content, $document, $offset, false); + + $this->assertTrue(\array_key_exists('NameType', $elements)); + $this->assertTrue($elements['NameType'] instanceof ElementName); + $this->assertEquals('FlateDecode', $elements['NameType']->getContent()); + + $this->assertTrue(\array_key_exists('Contents', $elements)); + $this->assertTrue($elements['Contents'] instanceof ElementArray); + $this->assertTrue($elements['Contents']->contains(42)); + + $this->assertTrue(\array_key_exists('Fonts', $elements)); + $this->assertTrue($elements['Fonts'] instanceof Header); + + $this->assertTrue(\array_key_exists('NullType', $elements)); + $this->assertTrue($elements['NullType'] instanceof ElementNull); + $this->assertEquals('null', (string) $elements['NullType']); + + $this->assertTrue(\array_key_exists('StringType', $elements)); + $this->assertTrue($elements['StringType'] instanceof ElementString); + $this->assertEquals('hello', $elements['StringType']->getContent()); + + $this->assertTrue(\array_key_exists('DateType', $elements)); + $this->assertTrue($elements['DateType'] instanceof ElementDate); + + $this->assertTrue(\array_key_exists('XRefType', $elements)); + $this->assertTrue($elements['XRefType'] instanceof ElementXRef); + $this->assertEquals('2_0', $elements['XRefType']->getId()); + + $this->assertTrue(\array_key_exists('NumericType', $elements)); + $this->assertTrue($elements['NumericType'] instanceof ElementNumeric); + $this->assertEquals('8', (string) $elements['NumericType']); + + $this->assertTrue(\array_key_exists('HexaType', $elements)); + $this->assertTrue($elements['HexaType'] instanceof ElementString); + $this->assertEquals(' ', (string) $elements['HexaType']); + 
$this->assertTrue(\array_key_exists('BooleanType', $elements)); + $this->assertTrue($elements['BooleanType'] instanceof ElementBoolean); + $this->assertFalse($elements['BooleanType']->getContent()); + + $this->assertTrue(\array_key_exists('Space Test', $elements)); + + $this->assertTrue(\array_key_exists('Hyphen-Test', $elements)); + + // Only_values = true: "/NameType /FlateDecode" yields two elements and the offset reaches the end of the input. + $content = '/NameType /FlateDecode'; + $offset = 0; + $elements = Element::parse($content, $document, $offset, true); + $this->assertEquals(2, \count($elements)); + $this->assertEquals(22, $offset); + + // Test error: parsing stops at the unparsable "$$$"; only complete name/value pairs read before it are returned. + $content = '/NameType /FlateDecode $$$'; + $offset = 0; + $elements = Element::parse($content, $document, $offset, false); + $this->assertEquals(1, \count($elements)); + $this->assertEquals(22, $offset); + $this->assertEquals('NameType', key($elements)); + $this->assertTrue(current($elements) instanceof ElementName); + + $content = '/NameType $$$'; + $offset = 0; + $elements = Element::parse($content, $document, $offset, false); + $this->assertEquals(0, $offset); + $this->assertEquals(0, \count($elements)); + } + + public function testGetContent(): void + { + $element = $this->getElementInstance(42); + $content = $element->getContent(); + $this->assertEquals(42, $content); + + $element = $this->getElementInstance([4, 2]); + $this->assertEquals(2, \count($element->getContent())); + } + + public function testEquals(): void + { + $element = $this->getElementInstance(2); + + $this->assertTrue($element->equals(2)); + } + + public function testContains(): void + { + $element = $this->getElementInstance([$this->getElementInstance(4), $this->getElementInstance(2)]); + + $this->assertTrue($element->contains(2)); + $this->assertFalse($element->contains(8)); + } + + public function testToString(): void + { + 
$this->assertEquals((string) $this->getElementInstance('2'), '2'); + } +} diff --git a/tests/PHPUnit/Integration/EncodingTest.php b/tests/PHPUnit/Integration/EncodingTest.php new file mode 100644 
index 00000000..aa01fac3 --- /dev/null +++ b/tests/PHPUnit/Integration/EncodingTest.php @@ -0,0 +1,130 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; +use Smalot\PdfParser\Encoding; +use Smalot\PdfParser\Encoding\StandardEncoding; +use Smalot\PdfParser\Exception\EncodingNotFoundException; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Parser; + +class EncodingTest extends TestCase +{ + protected function setUp(): void + { + parent::setUp(); + + $this->fixture = new Parser(); + } + + public function testGetEncodingClass(): void + { + $header = new Header(['BaseEncoding' => new Element('StandardEncoding')]); + + $encoding = new Encoding(new Document(), $header); + $encoding->init(); + + $this->assertEquals('\\'.StandardEncoding::class, $encoding->__toString()); + } + + /** + * This test checks the behavior if the given Encoding class doesn't exist. + * + * The protected method getEncodingClass is called in init and __toString. + * It throws an exception if the class is not available. 
+ * Calling init is enough to trigger the exception, but the __toString call afterwards + * makes sure that we don't miss it. + */ + public function testInitGetEncodingClassMissingClassException(): void + { + $this->expectException(EncodingNotFoundException::class); + $this->expectExceptionMessage('Missing encoding data for: "invalid"'); + + $header = new Header(['BaseEncoding' => new Element('invalid')]); + + $encoding = new Encoding(new Document(), $header); + $encoding->init(); + + $encoding->__toString(); + } + + /** + * This test focuses on the behavior of Encoding::__toString when running PHP 7.4+ and earlier versions. + * + * Prior to PHP 7.4 we expect an empty string to be returned (based on PHP specification). + * With PHP 7.4+ we expect an exception to be thrown when the class is invalid. + */ + public function testToStringGetEncodingClassMissingClassException(): void + { + // prior to PHP 7.4 toString has to return an empty string. + if (version_compare(\PHP_VERSION, '7.4.0', '<')) { + $header = new Header(['BaseEncoding' => new Element('invalid')]); + + $encoding = new Encoding(new Document(), $header); + + $this->assertEquals('', $encoding->__toString()); + } else { + // PHP 7.4+ + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Missing encoding data for: "invalid"'); + + $header = new Header(['BaseEncoding' => new Element('invalid')]); + + $encoding = new Encoding(new Document(), $header); + + $encoding->__toString(); + } + } + + /** + * Fall back to 'StandardEncoding' when the document has none + * + * @see https://github.com/smalot/pdfparser/issues/665 + */ + public function testEmptyBaseEncodingFallback(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue665.pdf'; + + $document = $this->fixture->parseFile($filename); + $objects = $document->getObjects(); + + $this->assertEquals(25, \count($objects)); + $this->assertArrayHasKey('3_0', $objects); + } +} diff --git a/tests/PHPUnit/Integration/FontTest.php 
b/tests/PHPUnit/Integration/FontTest.php new file mode 100644 index 00000000..b103fdfc --- /dev/null +++ b/tests/PHPUnit/Integration/FontTest.php @@ -0,0 +1,596 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . 
+ */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; +use Smalot\PdfParser\Encoding; +use Smalot\PdfParser\Font; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\PDFObject; + +class FontTest extends TestCase +{ + public function testGetName(): void + { + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + $font = reset($fonts); + + $this->assertEquals('OJHCYD+Cambria,Bold', $font->getName()); + } + + public function testGetType(): void + { + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + $font = reset($fonts); + + $this->assertEquals('TrueType', $font->getType()); + } + + public function testGetDetails(): void + { + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + $font = reset($fonts); + $reference = [ + 'Name' => 'OJHCYD+Cambria,Bold', + 'Type' => 'TrueType', + 'Encoding' => 'Ansi', + 'BaseFont' => 'OJHCYD+Cambria,Bold', + 'FontDescriptor' => [ + 'Type' => 'FontDescriptor', + 'FontName' => 'OJHCYD+Cambria,Bold', + 'Flags' => 4, + 'Ascent' => 699, + 'CapHeight' => 699, + 'Descent' => -7, + 'ItalicAngle' => 0, + 'StemV' => 128, + 'MissingWidth' => 658, + ], + 'ToUnicode' => [ + 'Filter' => 'FlateDecode', + 'Length' => 219, + ], + 'FirstChar' => 1, + 'LastChar' => 11, + 'Widths' => [ + 0 => 705, + 1 => 569, + 2 => 469, + 3 => 597, + 4 => 890, + 5 => 531, + 6 => 604, + 7 => 365, + 8 => 220, + 9 => 314, + 10 => 308, + ], + 'Subtype' => 'TrueType', + ]; + $this->assertEquals($reference, $font->getDetails()); + } + + 
public function testTranslateChar(): void + { + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + $font = reset($fonts); + + $this->assertEquals('D', $font->translateChar("\x01")); + $this->assertEquals('o', $font->translateChar("\x02")); + $this->assertEquals('c', $font->translateChar("\x03")); + $this->assertEquals('u', $font->translateChar("\x04")); + $this->assertEquals(Font::MISSING, $font->translateChar("\x99")); + } + + /** + * Tests buggy behavior of #364. + * + * In some cases Font::translateChar calls Encoding::__toString, which doesn't exist. + * + * Resulting error: Call to undefined method Smalot\PdfParser\Encoding::__toString() + * + * @see https://github.com/smalot/pdfparser/issues/364 + */ + public function testTranslateCharIssue364(): void + { + /* + * Approach: we provoke the __toString call with a minimal set of input data. 
+ */ + $doc = new Document(); + + $header = new Header(['BaseEncoding' => new Element('StandardEncoding')]); + + $encoding = new Encoding($doc, $header); + $encoding->init(); + + $font = new Font($doc, new Header(['Encoding' => $encoding])); + $font->init(); + + // without the fix from #378, calling translateChar would raise "undefined method" error + $this->assertEquals('?', $font->translateChar('t')); + } + + public function testLoadTranslateTable(): void + { + $document = new Document(); + + $content = '<>'; + $header = Header::parse($content, $document); + $font = new Font($document, $header); + + $content = '/CIDInit /ProcSet findresource begin +14 dict begin +begincmap +/CIDSystemInfo +<< /Registry (Adobe) +/Ordering (UCS) +/Supplement 0 +>> def +/CMapName /Adobe-Identity-UCS def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange +3 beginbfchar +<0003> <0020> +<000F> <002C> +<0011> <002E> +endbfchar +2 beginbfrange +<0013> <0016> <0030> +<0018> <001C> <0035> +endbfrange +7 beginbfchar +<0023> <0040> +<0026> <0043> +<0028> <0045> +<0030> <004D> +<0033> <0050> +<0035> <0052> +<0039> <0056> +endbfchar +4 beginbfrange +<0044> <004C> <0061> +<004F> <0052> <006C> +<0054> <0059> <0071> +<005B> <005C> <0078> +endbfrange +4 beginbfchar +<0070> <00E9> +<00AB> <2026> +<00B0> <0153> +<00B6> <2019> +endbfchar +1 beginbfrange +<0084> <0086> [<0061> <0071> <0081>] +endbfrange +endcmap +CMapName currentdict /CMap defineresource pop +end +end'; + $unicode = new PDFObject($document, null, $content); + + $document->setObjects(['1_0' => $font, '2_0' => $unicode]); + + $font->init(); + // Test reload + $table = $font->loadTranslateTable(); + + $this->assertEquals(47, \count($table)); + + // Test chars + $this->assertEquals(' ', $table[3]); + $this->assertEquals(',', $table[15]); + $this->assertEquals('.', $table[17]); + $this->assertEquals('@', $table[35]); + $this->assertEquals('V', $table[57]); + + // Test ranges + $this->assertEquals('r', $table[85]); + 
$this->assertEquals('y', $table[92]); + } + + /** + * Tests loadTranslateTable with a bfrange definition that lists every destination character. + * + * @see https://github.com/smalot/pdfparser/issues/631 + */ + public function testLoadTranslateTableIssue631(): void + { + $document = new Document(); + + $content = '<>'; + $header = Header::parse($content, $document); + $font = new Font($document, $header); + + $content = '/CIDInit /ProcSet findresource begin +1 dict begin +begincmap +/CIDSystemInfo +<< /Registry (Adobe) +/Ordering (UCS) +/Supplement 0 +>> def +/CMapName /Adobe-Identity-UCS def +/CMapType 2 def +1 beginbfrange +<0677> <0689> [ <06B5FE8E> <06B5FE8E> <06B6FE8E> <06B6FE8E> <06B7FE8E> <06B7FE8E> <06B8FE8E> <06B8FE8E> <06F4> <0667> ] +<0690> <0693> [ ] +<0694> <0695> [F<> 123 <>00] +<0696> <0701> [<1> <23> <456> <7890> ] +endbfrange +endcmap +CMapName currentdict /CMap defineresource pop +end +end'; + $unicode = new PDFObject($document, null, $content); + + $document->setObjects(['1_0' => $font, '2_0' => $unicode]); + + $font->init(); + // Test reload + $table = $font->loadTranslateTable(); + + $this->assertEquals(29, \count($table)); + + // Test ranges + $this->assertEquals("\u{FB1F}", $table[0x0677]); + $this->assertEquals("\u{FEDF}\u{0672}", $table[0x0678]); + $this->assertEquals("\u{FEE0}\u{0672}", $table[0x0679]); + $this->assertEquals("\u{FEDF}\u{0673}", $table[0x067A]); + $this->assertEquals("\u{FEE0}\u{0673}", $table[0x067B]); + $this->assertEquals("\u{FEDF}\u{0675}", $table[0x067C]); + $this->assertEquals("\u{FEE0}\u{0675}", $table[0x067D]); + $this->assertEquals("\u{06B5}\u{FE8E}", $table[0x067E]); + $this->assertEquals("\u{06B5}\u{FE8E}", $table[0x067F]); + $this->assertEquals("\u{06B6}\u{FE8E}", $table[0x0680]); + $this->assertEquals("\u{06B6}\u{FE8E}", $table[0x0681]); + $this->assertEquals("\u{06B7}\u{FE8E}", $table[0x0682]); + $this->assertEquals("\u{06B7}\u{FE8E}", $table[0x0683]); + $this->assertEquals("\u{06B8}\u{FE8E}", 
$table[0x0684]); + $this->assertEquals("\u{06B8}\u{FE8E}", $table[0x0685]); + $this->assertEquals("\u{06F4}", $table[0x0686]); + $this->assertEquals("\u{0667}", $table[0x0687]); + $this->assertEquals("\u{FEDF}\u{FB51}", $table[0x0688]); + $this->assertEquals("\u{FEE0}\u{FB51}", $table[0x0689]); + $this->assertEquals("\u{FFFF}", $table[0x0690]); + $this->assertEquals("\u{FFFF}\u{FFFF}", $table[0x0691]); + $this->assertEquals("\u{FFFF}\u{FFFF}\u{FFFF}", $table[0x0692]); + $this->assertEquals("\u{FFFF}\u{FFFF}\u{FFFF}\u{FFFF}", $table[0x0693]); + $this->assertEquals("\u{0001}", $table[0x0696]); + $this->assertEquals("\u{0023}", $table[0x0697]); + $this->assertEquals("\u{0456}", $table[0x0698]); + $this->assertEquals("\u{7890}", $table[0x0699]); + $this->assertEquals("\u{ABCD}\u{000E}", $table[0x069A]); + $this->assertEquals("\u{F123}\u{0045}", $table[0x069B]); + } + + public function testDecodeHexadecimal(): void + { + $hexa = '<322041>'; + $this->assertEquals('2 A', Font::decodeHexadecimal($hexa)); + $this->assertEquals('2 A', Font::decodeHexadecimal($hexa, false)); + $this->assertEquals('(2 A)', Font::decodeHexadecimal($hexa, true)); + + $hexa = '<003200200041>'; + $this->assertEquals("\x002\x00 \x00A", Font::decodeHexadecimal($hexa)); + $this->assertEquals("\x002\x00 \x00A", Font::decodeHexadecimal($hexa, false)); + $this->assertEquals("(\x002\x00 \x00A)", Font::decodeHexadecimal($hexa, true)); + + $hexa = '<00320020> 8 <0041>'; + $this->assertEquals("\x002\x00 8 \x00A", Font::decodeHexadecimal($hexa)); + $this->assertEquals("\x002\x00 8 \x00A", Font::decodeHexadecimal($hexa, false)); + $this->assertEquals("(\x002\x00 ) 8 (\x00A)", Font::decodeHexadecimal($hexa, true)); + + $hexa = '<3220> 8 <41>'; + $this->assertEquals('2 8 A', Font::decodeHexadecimal($hexa)); + $this->assertEquals('2 8 A', Font::decodeHexadecimal($hexa, false)); + $this->assertEquals('(2 ) 8 (A)', Font::decodeHexadecimal($hexa, true)); + + $hexa = '<00320020005C>-10<0041>'; + 
$this->assertEquals("\x002\x00 \x00\\-10\x00A", Font::decodeHexadecimal($hexa)); + $this->assertEquals("\x002\x00 \x00\\-10\x00A", Font::decodeHexadecimal($hexa, false)); + $this->assertEquals("(\x002\x00 \x00\\\\)-10(\x00A)", Font::decodeHexadecimal($hexa, true)); + + // If it contains XML, the function needs to return the same value. + $hexa = '

Example

'; + $this->assertEquals($hexa, Font::decodeHexadecimal($hexa)); + + // hexadecimal string with a line break should not return the input string + // addressing issue #273: https://github.com/smalot/pdfparser/issues/273 + $hexa = "<0027004c0056005300520051004c0045004c004f004c005d0044006f006d0052001d000300560048005b00570044001000490048004c00550044000f0003001400170003004700480003004900480059004800550048004c00550052000300470048000300\n15001300150013>"; + $this->assertEquals("\x0\x27\x0\x4c\x0\x56\x0\x53\x0\x52\x0\x51\x0\x4c\x0\x45\x0\x4c\x0\x4f\x0\x4c\x0\x5d\x0\x44\x0\x6f\x0\x6d\x0\x52\x0\x1d\x0\x3\x0\x56\x0\x48\x0\x5b\x0\x57\x0\x44\x0\x10\x0\x49\x0\x48\x0\x4c\x0\x55\x0\x44\x0\xf\x0\x3\x0\x14\x0\x17\x0\x3\x0\x47\x0\x48\x0\x3\x0\x49\x0\x48\x0\x59\x0\x48\x0\x55\x0\x48\x0\x4c\x0\x55\x0\x52\x0\x3\x0\x47\x0\x48\x0\x3\x0\x15\x0\x13\x0\x15\x0\x13", Font::decodeHexadecimal($hexa)); + } + + public function testDecodeOctal(): void + { + $this->assertEquals('AB C', Font::decodeOctal('\\101\\102\\040\\103')); + $this->assertEquals('AB CD', Font::decodeOctal('\\101\\102\\040\\103D')); + $this->assertEquals('AB \199', Font::decodeOctal('\\101\\102\\040\\\\199')); + + // Test that series of backslashes of arbitrary length are decoded properly + $this->assertEquals('-', Font::decodeOctal('\\055')); // \055 + $this->assertEquals('\\055', Font::decodeOctal('\\\\055')); // \\055 + $this->assertEquals('\\-', Font::decodeOctal('\\\\\\055')); // \\\055 + $this->assertEquals('\\\\055', Font::decodeOctal('\\\\\\\\055')); // \\\\055 + $this->assertEquals('\\\\-', Font::decodeOctal('\\\\\\\\\\055')); // \\\\\055 + $this->assertEquals('\\\\\\055', Font::decodeOctal('\\\\\\\\\\\\055')); // \\\\\\055 + $this->assertEquals('\\\\\\-', Font::decodeOctal('\\\\\\\\\\\\\\055')); // \\\\\\\055 + + // Make sure we're unescaping ( and ) before returning the escaped + // backslashes to the string + $this->assertEquals('\\(', Font::decodeOctal('\\\\(')); // \\( - nothing to unescape + 
$this->assertEquals('\\(', Font::decodeOctal('\\\\\\(')); // \\\( - parenthesis unescaped + $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\(')); // \\\\( - nothing to unescape + $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\\\(')); // \\\\\( - parenthesis unescaped + } + + public function testDecodeEntities(): void + { + $this->assertEquals('File Type', Font::decodeEntities('File#20Type')); + $this->assertEquals('File# Ty#pe', Font::decodeEntities('File##20Ty#pe')); + $this->assertEquals('Fi#le#-Ty#p#e ', Font::decodeEntities('Fi#23le##2DTy#p#e ')); + } + + public function testDecodeUnicode(): void + { + $this->assertEquals('AB', Font::decodeUnicode("\xFE\xFF\x00A\x00B")); + } + + public function testDecodeText(): void + { + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + // Cambria font + $font = reset($fonts); + $commands = [ + [ + 't' => '', + 'c' => "\x01\x02", + ], + [ + 't' => 'n', + 'c' => -10, + ], + [ + 't' => '', + 'c' => "\x03", + ], + [ + 't' => '', + 'c' => "\x04", + ], + [ + 't' => 'n', + 'c' => -100, + ], + [ + 't' => '<', + 'c' => '01020304', + ], + ]; + $this->assertEquals('Docu Docu', $font->decodeText($commands)); + + // Check if ANSI/Unicode detection is working properly + $filename = $this->rootDir.'/samples/bugs/Issue95_ANSI.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + $font = reset($fonts); + $commands = [ + [ + 't' => '<', + 'c' => 'E6F6FC', // ANSI encoded string + ], + ]; + $this->assertEquals('æöü', $font->decodeText($commands)); + } + + /** + * Font could have indirect encoding without `/Type /Encoding` + * which would be instance of PDFObject class (but not Encoding or ElementString). 
+ * + * @see https://github.com/smalot/pdfparser/pull/500 + */ + public function testDecodeTextForFontWithIndirectEncodingWithoutTypeEncoding(): void + { + $filename = $this->rootDir.'/samples/bugs/PullRequest500.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page1 = reset($pages); + $page1Text = $page1->getText(); + $expectedText = "Export\u{a0}transakční\u{a0}historie\n"; + $expectedText .= "Typ\u{a0}produktu:\u{a0}Podnikatelský\u{a0}účet\u{a0}Maxi\n"; + $expectedText .= "Číslo\u{a0}účtu:\u{a0}0000000000/0000\n"; + $expectedText .= "Počáteční\u{a0}zůstatek:\t000\u{a0}000,00\u{a0}Kč\n"; + $expectedText .= "Konečný\u{a0}zůstatek:\t000\u{a0}000,00\u{a0}Kč\n"; + $expectedText .= "Cena\u{a0}za\u{a0}služby"; + + $this->assertEquals($expectedText, trim($page1Text)); + } + + /** + * Tests buggy behavior which led to: + * + * Call to a member function getFontSpaceLimit() on null + * + * @see https://github.com/smalot/pdfparser/pull/403 + * + * @doesNotPerformAssertions + */ + public function testTriggerGetFontSpaceLimitOnNull(): void + { + // error is triggered, if we set the fourth parameter to null + $font = new Font(new Document(), null, null, new Config()); + + // both functions can trigger the error + $font->decodeText([]); + $font->getTextArray(); + } + + public function testXmlContent(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue18.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $text = trim($pages[0]->getText()); + + $this->assertEquals('Example PDF', $text); + } + + /** + * Create an instance of Header containing an instance of Encoding that doesn't have a BaseEncoding. + * Test that the Font doesn't raise an exception when the Encoding doesn't have a BaseEncoding. 
+ */ + public function testEncodingWithoutBaseEncoding(): void + { + $document = new Document(); + $header = new Header(['Encoding' => new Encoding($document)]); + $font = new Font(new Document(), $header); + $font->setTable([]); + $this->assertEquals('?', $font->translateChar('a')); + } + + public function testCalculateTextWidth(): void + { + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $fonts = $document->getFonts(); + + $font = $fonts['7_0']; + $widths = $font->getDetails()['Widths']; + $this->assertEquals($widths[0], $font->calculateTextWidth('D')); + $this->assertEquals($widths[10], $font->calculateTextWidth('l')); + + $width = $font->calculateTextWidth('Calibri', $missing); + $this->assertEquals(936, $width); + $this->assertEquals(['C', 'a', 'b', 'r'], $missing); + + $width = $fonts['9_0']->calculateTextWidth('Calibri', $missing); + $this->assertEquals(2573, $width); + $this->assertEquals([], $missing); + } + + public function testDecodeContent(): void + { + /* + * we do this to get into the branch with private method "decodeContentByEncodingElement" in Font.php + */ + $encoding = $this->createMock(Element::class); + $encoding->method('getContent')->willReturn('WinAnsiEncoding'); + $header = new Header(['Encoding' => $encoding]); + + $font = new Font($this->createMock(Document::class), $header); + + // Check that a string with UTF-16BE BOM is decoded directly + $this->assertEquals('ABC', $font->decodeContent("\xFE\xFF\x00\x41\x00\x42\x00\x43")); + } + + /** + * Check behavior if getDetails() does return an array without a Widths-key. 
+ * + * @see https://github.com/smalot/pdfparser/issues/619 + */ + public function testCalculateTextWidthNoWidthsKey(): void + { + $document = $this->createMock(Document::class); + + $header = $this->createMock(Header::class); + $header->method('getDetails')->willReturn([ + 'FirstChar' => '', + 'LastChar' => '', + // 'Widths' key is not set, so without the fix in Font.php a warning would be thrown. + ]); + + $font = new Font($document, $header); + $font->setTable([]); + $width = $font->calculateTextWidth('foo'); + + $this->assertNull($width); + } + + /** + * Check behavior if iconv function gets input which contains illegal characters. + * + * In this test we create a CP1252-encoded string, which contains a character that has no counterpart in UTF-8. + * This way we check if the old code triggers the expected warning: + * + * iconv(): Detected an illegal character in input string + * + * Note: Don't use PHPUnit 10+, because it will hide the warning. + * + * A list of invalid characters can be found here: + * https://www.ibm.com/docs/en/rational-synergy/7.2.1?topic=uc-text-encoding-illegal-character-detection-tool + * + * @see https://github.com/smalot/pdfparser/pull/549 + * @see https://github.com/smalot/pdfparser/pull/580 + */ + public function testDecodeContentIssue549(): void + { + /* + * we do this to get into the branch with private method "decodeContentByEncodingElement" in Font.php + */ + $encoding = $this->createMock(Element::class); + $encoding->method('getContent')->willReturn('WinAnsiEncoding'); + $header = new Header(['Encoding' => $encoding]); + + $font = new Font($this->createMock(Document::class), $header); + + // check result + $this->assertEquals('foobar-', $font->decodeContent("foobar-\x8D")); + } +} diff --git a/tests/PHPUnit/Integration/HeaderTest.php b/tests/PHPUnit/Integration/HeaderTest.php new file mode 100644 index 00000000..c18e7a35 --- /dev/null +++ b/tests/PHPUnit/Integration/HeaderTest.php @@ -0,0 +1,189 @@ + + * + * @date 2020-06-01 + 
* + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Element; +use Smalot\PdfParser\Element\ElementMissing; +use Smalot\PdfParser\Element\ElementName; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Page; +use Smalot\PdfParser\PDFObject; + +/** + * Class Header + */ +class HeaderTest extends TestCase +{ + /** + * Checks that init function is called for each element. + */ + public function testInitHappyPath(): void + { + $element = $this->createMock(Element::class); + $element->expects($this->exactly(1))->method('init'); + + $fixture = new Header([$element]); + $fixture->init(); + } + + /** + * Checks buggy behavior if an element was given which is not of type Element. + * + * Problem was, it always called $element::init(), even if its not an object at all. 
+ * + * @see https://github.com/smalot/pdfparser/issues/367 + * + * @doesNotPerformAssertions + */ + public function testInitInvalidElement(): void + { + $element = false; + + $fixture = new Header([$element]); + $fixture->init(); + } + + public function testParse(): void + { + $document = $this->getDocumentInstance(); + + $content = '<>foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + + $this->assertTrue($header instanceof Header); + $this->assertEquals(27, $position); + $this->assertEquals(2, \count($header->getElements())); + + // No header to parse + $this->assertEquals('Page', (string) $header->get('Type')); + $content = 'foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + + $this->assertTrue($header instanceof Header); + $this->assertEquals(0, $position); + $this->assertEquals(0, \count($header->getElements())); + + $position = 0; + $content = "<>"; + Header::parse($content, $document, $position); + $this->assertEquals(212, $position); + + $position = 0; + $content = '[5 0 R ] foo'; + $header = Header::parse($content, $document, $position); + $this->assertEquals(8, $position); + $this->assertEquals(1, \count($header->getElements())); + } + + public function testGetElements(): void + { + $document = $this->getDocumentInstance(); + + $content = '<>foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + + $elements = $header->getElements(); + $this->assertEquals(2, \count($elements)); + $this->assertTrue(current($elements) instanceof ElementName); + + $types = $header->getElementTypes(); + $this->assertTrue(\is_array($types)); + $this->assertEquals(ElementName::class, $types['Type']); + $this->assertEquals(ElementName::class, $types['Subtype']); + } + + public function testHas(): void + { + $document = $this->getDocumentInstance(); + + $content = '<>foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + + 
$this->assertTrue($header->has('Type')); + $this->assertTrue($header->has('SubType')); + $this->assertTrue($header->has('Font')); + $this->assertFalse($header->has('Text')); + } + + public function testGet(): void + { + $document = $this->getDocumentInstance(); + + $content = '<>foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + $object = new Page($document, $header); + $document->setObjects(['5_0' => $object]); + + $this->assertTrue($header->get('Type') instanceof ElementName); + $this->assertTrue($header->get('SubType') instanceof ElementName); + $this->assertTrue($header->get('Font') instanceof Page); + $this->assertTrue($header->get('Image') instanceof ElementMissing); + $this->assertTrue($header->get('Resources') instanceof ElementMissing); + + /** + * A double forward slash in the header's content results in a falsy element + * that should be parsed to ElementMissing instead. + * + * @see https://github.com/smalot/pdfparser/pull/525 + */ + $content = '<>foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + + $this->assertTrue($header->get('SubType') instanceof ElementMissing); + } + + public function testResolveXRef(): void + { + $document = $this->getDocumentInstance(); + $content = '<>foo'; + $position = 0; + $header = Header::parse($content, $document, $position); + $object = new Page($document, $header); + $document->setObjects(['5_0' => $object]); + + $this->assertTrue($header->get('Font') instanceof PDFObject); + + $header = $header->get('Resources'); + $this->assertTrue($header instanceof ElementMissing); + } +} diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php new file mode 100644 index 00000000..072dfd58 --- /dev/null +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -0,0 +1,578 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf 
library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\PDFObject; + +class PDFObjectTest extends TestCase +{ + public const TYPE = 't'; + + public const OPERATOR = 'o'; + + public const COMMAND = 'c'; + + protected function getPdfObjectInstance($document): PDFObject + { + return new PDFObject($document); + } + + public function testGetCommandsText(): void + { + $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4 +342.561 Tm +[(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>] +TJ /R14 17.16 Tf <20> Tj +0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj +/R14 20.04 Tf +ET Q +q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm +BI"; + + $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content); + + $offset = 0; + $parts = []; + foreach ($sections as $section) { + $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0]; + } + + $reference = [ + [ + self::TYPE => '', + self::OPERATOR => 'BT', + self::COMMAND => '', + ], + [ + self::TYPE => '/', + self::OPERATOR => 'Tf', + self::COMMAND => 'R14 30', + ], + [ + self::TYPE => '', + self::OPERATOR => 'Tm', + self::COMMAND => '0.999016 0 0 1 137.4 342.561', + ], + [ + self::TYPE => 
'[', + self::OPERATOR => 'TJ', + self::COMMAND => [ + [ + self::TYPE => '(', + self::OPERATOR => 'TJ', + self::COMMAND => 'A', + ], + [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => '-168.854', + ], + [ + self::TYPE => '(', + self::OPERATOR => 'TJ', + self::COMMAND => ' BC D', + ], + [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => '-220.905', + ], + [ + self::TYPE => '(', + self::OPERATOR => 'TJ', + self::COMMAND => '\\(E\\)', + ], + [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => '20.905', + ], + [ + self::TYPE => '<', + self::OPERATOR => 'TJ', + self::COMMAND => '20', + ], + ], + ], + [ + self::TYPE => '/', + self::OPERATOR => 'Tf', + self::COMMAND => 'R14 17.16', + ], + [ + self::TYPE => '<', + self::OPERATOR => 'Tj', + self::COMMAND => '20', + ], + [ + self::TYPE => '', + self::OPERATOR => 'Tm', + self::COMMAND => '0.999014 0 0 1 336.84 319.161', + ], + [ + self::TYPE => '', + self::OPERATOR => 'T*', + self::COMMAND => '', + ], + [ + self::TYPE => '(', + self::OPERATOR => 'Tj', + self::COMMAND => " \x00m", + ], + [ + self::TYPE => '/', + self::OPERATOR => 'Tf', + self::COMMAND => 'R14 20.04', + ], + [ + self::TYPE => '', + self::OPERATOR => 'ET', + self::COMMAND => '', + ], + [ + self::TYPE => '', + self::OPERATOR => 'Q', + self::COMMAND => '', + ], + [ + self::TYPE => '', + self::OPERATOR => 'q', + self::COMMAND => '', + ], + [ + self::TYPE => '', + self::OPERATOR => 'cm', + self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95', + ], + ]; + + $this->assertEquals($parts, $reference); + } + + public function testCleanContent(): void + { + $content = '/Shape <>> BT >>BDC +Q +/CS0 cs 1 1 0 scn +1 i +/GS0 gs +BT +/TT0 1 Tf +0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm +(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj +EMC +(ABC) Tj + +[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TD + +ET +/Shape <>BDC +q +0.03 841'; + + $expected = '_____________________________________ +Q 
+/CS0 cs 1 1 0 scn +1 i +/GS0 gs +BT +/TT0 1 Tf +0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm +(________________________________________________)Tj +___ +(___) Tj + +[_____________________________________] TD + +ET +______________________ +q +0.03 841'; + + $cleaned = $this->getPdfObjectInstance(new Document())->cleanContent($content, '_'); + + $this->assertEquals($cleaned, $expected); + } + + public function testFormatContent(): void + { + $content = '/Shape <>> BT >>BDC Q /CS0 cs 1 1 0 scn 1 i +/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm +(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj +[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <>BDC q 0.03 841'; + + $expected = '/Shape <>> BT >>BDC +Q +/CS0 cs +1 1 0 scn +1 i +/GS0 gs +BT +/TT0 1 Tf +0.0007 Tc +0.0018 Tw +0 Ts +100 Tz +0 Tr +24 0 0 24 51.3 639.26025 Tm +(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj +EMC +(ABC) Tj +[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ +ET +/Shape <>BDC +q +0.03 841'; + + // Normalize line-endings + $expected = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $expected); + + $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent'); + + // TODO: remove this if-clause when dropping 8.0.x support + // From documentation > http://php.net/manual/en/reflectionproperty.setaccessible.php: + // As of PHP 8.1.0, calling this method has no effect; all properties are accessible by default. 
+ if (version_compare(PHP_VERSION, '8.1.0', '<')) { + $formatContent->setAccessible(true); + } + + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertEquals($expected, $cleaned); + + // Check that binary data is rejected + $content = hex2bin('a670c89d4a324e47'); + + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertEquals('', $cleaned); + + // See: https://github.com/smalot/pdfparser/issues/668 + $filename = $this->rootDir.'/samples/bugs/Issue668.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + // Binary check is done before a regexp that causes an error + $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText()); + + // mb_check_encoding(..., 'UTF-8') returns true here, + // necessitating a test for UTF-8 that's more strict + $content = hex2bin('0101010101010101'); + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertEquals('', $cleaned); + } + + /** + * Check that escaped slashes and parentheses are accounted for, + * formatContent would emit a PHP Warning for "regular expression + * is too large" here without fix for issue #709 + * + * @see https://github.com/smalot/pdfparser/issues/709 + */ + public function testFormatContentIssue709() + { + $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent'); + + // TODO: remove this if-clause when dropping 8.0.x support + // From documentation > http://php.net/manual/en/reflectionproperty.setaccessible.php: + // As of PHP 8.1.0, calling this method has no effect; all properties are accessible by default. 
+ if (version_compare(PHP_VERSION, '8.1.0', '<')) { + $formatContent->setAccessible(true); + } + + $content = '(String \\\\\\(string)Tj '.str_repeat('(Test)Tj ', 4500); + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertStringContainsString('(String \\\\\\(string)Tj'."\r\n", $cleaned); + } + + /** + * Check that inline image data does not corrupt the stream + * + * @see: https://github.com/smalot/pdfparser/issues/691 + */ + public function testFormatContentInlineImages(): void + { + $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent'); + + // TODO: remove this if-clause when dropping 8.0.x support + // From documentation > http://php.net/manual/en/reflectionproperty.setaccessible.php: + // As of PHP 8.1.0, calling this method has no effect; all properties are accessible by default. + if (version_compare(PHP_VERSION, '8.1.0', '<')) { + $formatContent->setAccessible(true); + } + + $cleaned = $formatContent->invoke( + $this->getPdfObjectInstance(new Document()), + 'BT (This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD ET q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 +/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g' + ); + + // PdfParser should not be fooled by Q's in inline image data; + // Only one 'Q' command should be found + $commandQ = preg_match_all('/Q\r\n/', $cleaned); + $this->assertEquals(1, $commandQ); + + // The 'BI' inside a string should not be interpreted as the + // beginning of an inline image command + $this->assertStringContainsString('(This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD', $cleaned); + + $cleaned = $formatContent->invoke( + $this->getPdfObjectInstance(new Document()), + 'BT (This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD (There is no ID inline image in this data) TD (Nothing but text EI 
should be found) TD ET' + ); + + $this->assertEquals('BT'."\r\n". +'(This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD'."\r\n". +'(There is no ID inline image in this data) TD'."\r\n". +'(Nothing but text EI should be found) TD'."\r\n". +'ET', $cleaned); + } + + public function testGetSectionsText(): void + { + $content = '/Shape <>BDC +Q +/CS0 cs 1 1 0 scn +1 i +/GS0 gs +BT +/TT0 1 Tf +0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm +(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj +EMC +(ABC) Tj + +[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD + +ET +/Shape <>BDC BT /TT1 1.5 Tf (BT )Tj ET +q +0.03 841'; + + $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content); + + $this->assertEquals( + [ + '/Shape <>BDC', + 'Q', + 'BT', + '/TT0 1 Tf', + '0.0007 Tc', + '0.0018 Tw', + '0 Ts', + '100 Tz', + '0 Tr', + '24 0 0 24 51.3 639.26025 Tm', + '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj', + 'EMC', + '(ABC) Tj', + '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD', + 'ET', + '/Shape <>BDC', + 'BT', + '/TT1 1.5 Tf', + '(BT )Tj', + 'ET', + 'q', + ], + $sections + ); + + // Test that a Name containing 'ET' doesn't close a 'BT' block + // See: https://github.com/smalot/pdfparser/issues/474 + $content = 'BT +/FTxkPETkkj 8 Tf +1 0 0 1 535.55 627.4 Tm +(Hello World)TJ +ET'; + + $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content); + + $this->assertNotEquals('/FTxkP', $sections[0]); + $this->assertNotEquals('/FTxkP', $sections[1]); + } + + public function testParseDictionary(): void + { + $data = '<> /Array[/Parsed /Data/Actual]/Silent<>>>'; + + $dictionary = $this->getPdfObjectInstance(new Document())->parseDictionary($data); + + $this->assertArrayHasKey('ActualText', $dictionary); + $this->assertArrayHasKey('XObject', $dictionary); + $this->assertArrayHasKey('Array', $dictionary); + $this->assertArrayHasKey('Silent', $dictionary); + + $this->assertCount(3, 
$dictionary['Array']); + + $this->assertEquals('<>', $dictionary['Silent']); + } + + /** + * Tests that graphics position (cm) is taken into account when + * positioning text + * + * @see: https://github.com/smalot/pdfparser/issues/608 + */ + public function testGraphicsPositioning(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue608.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + // The \n is not added if 'cm' commands are ignored + $this->assertStringContainsString("Heading 1 \nLorem ipsum", $pages[0]->getText()); + } + + /** + * Tests that ActualText text is printed for a block instead of the + * contents of the Tj or TJ commands in the block. + * + * @see: https://github.com/smalot/pdfparser/issues/464 + */ + public function testActualText(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue608.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + // An ActualText command subs in the three literal characters + // 'ffi' for the single character ligature here + // In addition, if $last_written_position isn't used to store + // the position to insert, \n's would be erroniously inserted + // on either side of the 'ffi' + $this->assertStringContainsString('efficitur', $pages[0]->getText()); + } + + /** + * Tests for the correct decoding of an Em-dash character in + * certain font contexts + * + * See: https://github.com/smalot/pdfparser/issues/585 + */ + public function testDecodeEmDash(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue585.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('слева по ходу — веревка', $pages[0]->getText()); + } + + /** + * Tests behavior with reversed chars instruction. 
+ * + * @see: https://github.com/smalot/pdfparser/issues/398 + */ + public function testReversedChars(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue398.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $pageText = $pages[0]->getText(); + + $this->assertStringContainsString('שלומי טסט', $pageText); + $this->assertStringContainsString('בנמל מספנות ישראל.', $pageText); + } + + /** + * Tests that a text stream with an improperly selected font code + * page falls back to one that maps all characters. + * + * @see: https://github.com/smalot/pdfparser/issues/586 + */ + public function testImproperFontFallback(): void + { + $filename = $this->rootDir.'/samples/ImproperFontFallback.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('сделал', $pages[0]->getText()); + } + + /** + * Tests that a font ID containing a hyphen / dash character was + * correctly parsed + * + * @see: https://github.com/smalot/pdfparser/issues/145 + */ + public function testFontIDWithHyphen(): void + { + $pdfObject = $this->getPdfObjectInstance(new Document()); + + $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf'); + + $this->assertEquals('/', $fontCommandHyphen[0]['t']); + $this->assertEquals('Tf', $fontCommandHyphen[0]['o']); + $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0]['c']); + } + + /** + * Tests that an invalid command does not cause an error, but just + * returns an empty array + */ + public function testInvalidCommand(): void + { + $pdfObject = $this->getPdfObjectInstance(new Document()); + + $validCommand = $pdfObject->getCommandsText('75 rg'); + + $this->assertEquals('', $validCommand[0]['t']); + $this->assertEquals('rg', $validCommand[0]['o']); + $this->assertEquals('75', $validCommand[0]['c']); + + $invalidCommand = $pdfObject->getCommandsText('75'); 
+ + $this->assertEquals([], $invalidCommand); + } +} diff --git a/tests/PHPUnit/Integration/PageTest.php b/tests/PHPUnit/Integration/PageTest.php new file mode 100644 index 00000000..33751e59 --- /dev/null +++ b/tests/PHPUnit/Integration/PageTest.php @@ -0,0 +1,961 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element\ElementMissing; +use Smalot\PdfParser\Font; +use Smalot\PdfParser\Page; + +class PageTest extends TestCase +{ + public function testGetFonts(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + + // the first to load data. + $fonts = $page->getFonts(); + $this->assertTrue(0 < \count($fonts)); + foreach ($fonts as $font) { + $this->assertTrue($font instanceof Font); + } + // the second to use cache. 
+ $fonts = $page->getFonts(); + $this->assertTrue(0 < \count($fonts)); + + // ------------------------------------------------------ + // Document without text. + $filename = $this->rootDir.'/samples/Document3_pdfcreator_nocompressed.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + + // the first to load data. + $fonts = $page->getFonts(); + $this->assertEquals(0, \count($fonts)); + // the second to use cache. + $fonts = $page->getFonts(); + $this->assertEquals(0, \count($fonts)); + } + + public function testGetFontsElementMissing(): void + { + $headerResources = $this->getMockBuilder('Smalot\PdfParser\Header') + ->disableOriginalConstructor() + ->getMock(); + + $headerResources->expects($this->once()) + ->method('has') + ->willReturn(true); + + $headerResources->expects($this->once()) + ->method('get') + ->willReturn(new ElementMissing()); + + $header = $this->getMockBuilder('Smalot\PdfParser\Header') + ->disableOriginalConstructor() + ->getMock(); + + $header->expects($this->once()) + ->method('get') + ->willReturn($headerResources); + + $page = new Page(new Document(), $header); + $fonts = $page->getFonts(); + + $this->assertEmpty($fonts); + $this->assertEquals([], $fonts); + } + + public function testGetFont(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + + // the first to load data. + $font = $page->getFont('R7'); + $this->assertTrue($font instanceof Font); + + $font = $page->getFont('ABC7'); + $this->assertTrue($font instanceof Font); + } + + public function testGetText(): void + { + // Document with text. 
+ $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $text = $page->getText(); + + $this->assertTrue(150 < \strlen($text)); + $this->assertStringContainsString('Document title', $text); + $this->assertStringContainsString('Lorem ipsum', $text); + + $this->assertStringContainsString('Calibri', $text); + $this->assertStringContainsString('Arial', $text); + $this->assertStringContainsString('Times', $text); + $this->assertStringContainsString('Courier New', $text); + $this->assertStringContainsString('Verdana', $text); + } + + /** + * @group memory-heavy + * + * @see https://github.com/smalot/pdfparser/pull/457 + */ + public function testGetTextPullRequest457(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/bugs/PullRequest457.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $text = $page->getText(); + + $this->assertTrue(1000 < \strlen($text)); + $this->assertStringContainsString('SUPER', $text); + $this->assertStringContainsString('VOORDEEL', $text); + $this->assertStringContainsString('KRANT', $text); + $this->assertStringContainsString('DINSDAG', $text); + $this->assertStringContainsString('Snelfilterkoffie', $text); + $this->assertStringContainsString('Aardappelen'."\n".'Zak', $text); + $this->assertStringContainsString('ALL', $text); + } + + public function testExtractRawData(): void + { + // Document with text. 
+ $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $extractedRawData = $page->extractRawData(); + + $btItem = $extractedRawData[4]; + $this->assertCount(3, $btItem); + $this->assertArrayHasKey('t', $btItem); + $this->assertArrayHasKey('o', $btItem); + $this->assertArrayHasKey('c', $btItem); + + $this->assertEquals('BT', $btItem['o']); + + $tmItem = $extractedRawData[6]; + + $this->assertcount(185, $extractedRawData); + $this->assertCount(3, $tmItem); + + $this->assertArrayHasKey('t', $tmItem); + $this->assertArrayHasKey('o', $tmItem); + $this->assertArrayHasKey('c', $tmItem); + + $this->assertStringContainsString('Tm', $tmItem['o']); + $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']); + } + + public function testExtractDecodedRawData(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $extractedDecodedRawData = $page->extractDecodedRawData(); + $tmItem = $extractedDecodedRawData[6]; + $this->assertCount(185, $extractedDecodedRawData); + $this->assertCount(3, $tmItem); + + $this->assertArrayHasKey('t', $tmItem); + $this->assertArrayHasKey('o', $tmItem); + $this->assertArrayHasKey('c', $tmItem); + + $this->assertStringContainsString('Tm', $tmItem['o']); + $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']); + + $this->assertCount(3, $tmItem); + $this->assertArrayHasKey('t', $tmItem); + $this->assertArrayHasKey('o', $tmItem); + $this->assertArrayHasKey('c', $tmItem); + + $tjItem = $extractedDecodedRawData[7]; + $this->assertStringContainsString('TJ', $tjItem['o']); + $this->assertStringContainsString('(', $tjItem['c'][0]['t']); + 
$this->assertStringContainsString('D', $tjItem['c'][0]['c']); + $this->assertStringContainsString('n', $tjItem['c'][1]['t']); + $this->assertStringContainsString('0.325008', $tjItem['c'][1]['c']); + $this->assertStringContainsString('(', $tjItem['c'][2]['t']); + $this->assertStringContainsString('o', $tjItem['c'][2]['c']); + } + + public function testExtractRawDataWithCorruptedPdf(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Unable to find xref (PDF corrupted?)'); + + $this + ->getParserInstance() + ->parseFile($this->rootDir.'/samples/corrupted.pdf') + ->getPages(); + } + + public function testGetDataCommands(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataCommands = $page->getDataCommands(); + $this->assertCount(185, $dataCommands); + + $tmItem = $dataCommands[6]; + $this->assertCount(3, $tmItem); + $this->assertArrayHasKey('t', $tmItem); + $this->assertArrayHasKey('o', $tmItem); + $this->assertArrayHasKey('c', $tmItem); + + $this->assertStringContainsString('Tm', $tmItem['o']); + $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']); + + $tjItem = $dataCommands[7]; + $this->assertCount(3, $tjItem); + $this->assertArrayHasKey('t', $tjItem); + $this->assertArrayHasKey('o', $tjItem); + $this->assertArrayHasKey('c', $tjItem); + + $this->assertStringContainsString('TJ', $tjItem['o']); + $this->assertStringContainsString('(', $tjItem['c'][0]['t']); + $this->assertStringContainsString('D', $tjItem['c'][0]['c']); + $this->assertStringContainsString('n', $tjItem['c'][1]['t']); + $this->assertStringContainsString('0.325008', $tjItem['c'][1]['c']); + $this->assertStringContainsString('(', $tjItem['c'][2]['t']); + $this->assertStringContainsString('o', $tjItem['c'][2]['c']); + } + + public 
function testGetDataTm(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + + $dataTm = $page->getDataTm(); + + $this->assertCount(81, $dataTm); + + $item = $dataTm[0]; + $this->assertCount(2, $item); + $this->assertCount(6, $item[0]); + $this->assertEquals( + [ + '0.999429', + '0', + '0', + '1', + '201.96', + '720.68', + ], + [ + round($item[0][0], 6), + round($item[0][1], 6), + round($item[0][2], 6), + round($item[0][3], 6), + round($item[0][4], 2), + round($item[0][5], 2), + ] + ); + $this->assertStringContainsString('Document title', $item[1]); + + $item = $dataTm[2]; + $this->assertEquals( + [ + '0.999402', + '0', + '0', + '1', + '70.8', + '673.64', + ], + [ + round($item[0][0], 6), + round($item[0][1], 6), + round($item[0][2], 6), + round($item[0][3], 6), + round($item[0][4], 2), + round($item[0][5], 2), + ] + ); + $this->assertStringContainsString('Calibri : Lorem ipsum dolor sit amet, consectetur a', $item[1]); + + $item = $dataTm[80]; + $this->assertEquals( + [ + '0.999402', + '0', + '0', + '1', + '342.84', + '81.44', + ], + [ + round($item[0][0], 6), + round($item[0][1], 6), + round($item[0][2], 6), + round($item[0][3], 6), + round($item[0][4], 2), + round($item[0][5], 2), + ] + ); + $this->assertStringContainsString('nenatis.', $item[1]); + + // ------------------------------------------------------ + // Document is a form + $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + $item = $dataTm[2]; + $this->assertCount(105, $dataTm); + $this->assertCount(2, $item); + $this->assertCount(6, $item[0]); + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '167.3', + '894.58', + ], + $item[0] + ); + 
$this->assertStringContainsString('MyName MyLastName', $item[1]); + + $item = $dataTm[6]; + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '681.94', + '877.42', + ], + $item[0] + ); + $this->assertStringContainsString('1/1/2020', $item[1]); + + $item = $dataTm[8]; + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '174.86', + '827.14', + ], + $item[0] + ); + $this->assertStringContainsString('Purchase 1', $item[1]); + + // ------------------------------------------------------ + // Document is another form of the same type + $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + + $item = $dataTm[2]; + $this->assertCount(105, $dataTm); + $this->assertCount(2, $item); + $this->assertCount(6, $item[0]); + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '167.3', + '894.58', + ], + $item[0] + ); + $this->assertStringContainsString("Other'sName Other'sLastName", $item[1]); + + $item = $dataTm[6]; + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '681.94', + '877.42', + ], + $item[0] + ); + $this->assertStringContainsString('2/2/2020', $item[1]); + + $item = $dataTm[8]; + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '174.86', + '827.14', + ], + $item[0] + ); + $this->assertStringContainsString('Purchase 2', $item[1]); + + // test if scaling by fontSize (Tf, Tfs) and test matrix (Tm) are taken into account + $dataCommands = [ + ['t' => '', 'o' => 'BT', 'c' => ''], // begin text + ['t' => '/', 'o' => 'Tf', 'c' => 'TT0 1'], // set font and scale font by 1 pt + ['t' => '', 'o' => 'Tm', 'c' => '7.5 -0 0 8.5 45.36 791.52'], // additionally scale by 7.5 pt + ['t' => '', 'o' => 'Td', 'c' => '0.568 0'], // move 0.568 * 7.5 pts (7.5 is horizontal scaling) to the right + ['t' => '(', 'o' => 'Tj', 'c' => 'test'], // print "test" + ['t' => '', 'o' => 'TD', 'c' => '-3.5 -1.291'], // move 3.5 * 7.5 
pts left, 1.291 * 8.5 (vertical scaling) pts down and set text leading to 9.464 + ['t' => '(', 'o' => 'Tj', 'c' => 'another test'], // print "another test" + ['t' => '', 'o' => '\'', 'c' => 'again a test'], // go to next line and print "again a test" + ['t' => '', 'o' => 'TL', 'c' => '5'], // set text leading by TL + ['t' => '', 'o' => '\'', 'c' => 'the next line'], // go to next line and print "the next line" + ]; + + // verify scaling is taken into account for Td + $dataTm = $page->getDataTm($dataCommands); + $item = $dataTm[0]; + $this->assertEquals( + [ + '7.5', + '-0', + '0', + '8.5', + '49.62', + '791.52', + ], + $item[0] + ); + + // verify scaling is taken into account for TD + $item = $dataTm[1]; + $this->assertEquals( + [ + '7.5', + '-0', + '0', + '8.5', + '23.37', + '780.5465', + ], + $item[0] + ); + + // verify scaling is taken into account for text leading set by TD + $item = $dataTm[2]; + $this->assertEquals( + [ + '7.5', + '-0', + '0', + '8.5', + '23.37', + '769.573', + ], + $item[0] + ); + + // verify scaling is taken into account for text leading set by TL + $item = $dataTm[3]; + $this->assertEquals( + [ + '7.5', + '-0', + '0', + '8.5', + '23.37', + '727.073', + ], + $item[0] + ); + } + + public function testDataTmFontInfoHasToBeIncluded(): void + { + $config = new Config(); + $config->setDataTmFontInfoHasToBeIncluded(true); + + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance($config); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + $fonts = $page->getFonts(); + + $item = $dataTm[0]; + $this->assertCount(4, $item); + $this->assertEquals($item[2], 'R7'); + $this->assertEquals($item[3], '27.96'); + $this->assertArrayHasKey('R7', $fonts); + $item = $dataTm[80]; + $this->assertCount(4, $item); + $this->assertEquals($item[2], 'R14'); + $this->assertEquals($item[3], '11.04'); + 
$this->assertArrayHasKey('R7', $fonts); + + $filename = $this->rootDir.'/samples/InternationalChars.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + $fonts = $page->getFonts(); + + $item = $dataTm[88]; + $this->assertEquals($item[2], 'C2_0'); + $this->assertEquals($item[3], '1'); + $this->assertArrayHasKey('C2_0', $fonts); + foreach ($dataTm as $item) { + $this->assertCount(4, $item); + } + } + + /** + * Tests getDataTm with hexadecimal encoded document text. + * + * @see https://github.com/smalot/pdfparser/issues/336 + */ + public function testGetDataTmIssue336(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue336_decode_hexadecimal.pdf'; + $document = $this->getParserInstance()->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + + $item = $dataTm[2]; + $this->assertCount(13, $dataTm); + $this->assertCount(2, $item); + $this->assertCount(6, $item[0]); + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '318.185', + '665.044', + ], + $item[0] + ); + $this->assertEquals('Lorem', $item[1]); + } + + /** + * Tests that getPages() only returns Page objects + * + * @see https://github.com/smalot/pdfparser/issues/331 + * + * Sample pdf file provided by @Reqrefusion, see + * https://github.com/smalot/pdfparser/pull/350#issuecomment-703195220 + */ + public function testGetPages(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue331.pdf'; + $document = $this->getParserInstance()->parseFile($filename); + $pages = $document->getPages(); + + /* + * The problem of issue #331 is fixed by the pull request of the issue #479. 
+ * The original Issue331.pdf was modified so for the updated version (actual + * version) a new xref was added and now the valid /Index has the following value: + * [1 1 3 1 7 1 175 1 178 1 219 2] + * This means, that there a 6 pairs containing the values for 'first object id' + * and 'number of objects'. Till now only the first entry was used and so the + * objects of all following entries gots a wrong id. + * By the fix of issue #479 now the expected number of pages is counted. + */ + $this->assertCount(3, $pages); + + foreach ($pages as $page) { + $this->assertTrue($page instanceof Page); + } + } + + public function testGetTextXY(): void + { + // Document with text. + $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $result = $page->getTextXY(201.96, 720.68, 0.01, 0.01); + $this->assertCount(1, $result); + $this->assertCount(2, $result[0]); + $this->assertEquals( + [ + '0.999429', + '0', + '0', + '1', + '201.96', + '720.68', + ], + [ + round($result[0][0][0], 6), + round($result[0][0][1], 6), + round($result[0][0][2], 6), + round($result[0][0][3], 6), + round($result[0][0][4], 2), + round($result[0][0][5], 2), + ] + ); + $this->assertStringContainsString('Document title', $result[0][1]); + + $result = $page->getTextXY(201, 720); + $this->assertCount(0, $result); + + $result = $page->getTextXY(201, 720, 1, 1); + $this->assertCount(1, $result); + $this->assertCount(2, $result[0]); + $this->assertEquals( + [ + '0.999429', + '0', + '0', + '1', + '201.96', + '720.68', + ], + [ + round($result[0][0][0], 6), + round($result[0][0][1], 6), + round($result[0][0][2], 6), + round($result[0][0][3], 6), + round($result[0][0][4], 2), + round($result[0][0][5], 2), + ] + ); + $this->assertStringContainsString('Document title', $result[0][1]); + + // ------------------------------------------------------ + // 
Document is a form + $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $result = $page->getTextXY(167, 894, 1, 1); + $this->assertCount(1, $result); + $this->assertCount(2, $result[0]); + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '167.3', + '894.58', + ], + $result[0][0] + ); + $this->assertStringContainsString('MyName MyLastName', $result[0][1]); + + $result = $page->getTextXY(681, 877, 1, 1); + $this->assertStringContainsString('1/1/2020', $result[0][1]); + + $result = $page->getTextXY(174, 827, 1, 1); + $this->assertStringContainsString('Purchase 1', $result[0][1]); + + // ------------------------------------------------------ + // Document is another form of the same type + $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $result = $page->getTextXY(167, 894, 1, 1); + $this->assertEquals( + [ + '1', + '0', + '0', + '1', + '167.3', + '894.58', + ], + $result[0][0] + ); + $this->assertStringContainsString("Other'sName Other'sLastName", $result[0][1]); + + $result = $page->getTextXY(681, 877, 1, 1); + $this->assertStringContainsString('2/2/2020', $result[0][1]); + + $result = $page->getTextXY(174, 827, 1, 1); + $this->assertStringContainsString('Purchase 2', $result[0][1]); + } + + public function testExtractDecodedRawDataIssue450(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue450.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $extractedDecodedRawData = $page->extractDecodedRawData(); + $this->assertIsArray($extractedDecodedRawData); + $this->assertGreaterThan(3, \count($extractedDecodedRawData)); + $this->assertIsArray($extractedDecodedRawData[3]); + $this->assertEquals('TJ', 
$extractedDecodedRawData[3]['o']); + $this->assertIsArray($extractedDecodedRawData[3]['c']); + $this->assertIsArray($extractedDecodedRawData[3]['c'][0]); + $this->assertEquals(3, \count($extractedDecodedRawData[3]['c'][0])); + $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $extractedDecodedRawData[3]['c'][0]['c']); + } + + public function testGetDataTmIssue450(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue450.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + $this->assertIsArray($dataTm); + $this->assertEquals(1, \count($dataTm)); + $this->assertIsArray($dataTm[0]); + $this->assertEquals(2, \count($dataTm[0])); + $this->assertIsArray($dataTm[0][0]); + $this->assertEquals(6, \count($dataTm[0][0])); + $this->assertEquals(1, $dataTm[0][0][0]); + $this->assertEquals(0, $dataTm[0][0][1]); + $this->assertEquals(0, $dataTm[0][0][2]); + $this->assertEquals(1, $dataTm[0][0][3]); + $this->assertEquals(67.5, $dataTm[0][0][4]); + $this->assertEquals(756.25, $dataTm[0][0][5]); + $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $dataTm[0][1]); + } + + public function testIsFpdf(): void + { + $filename = $this->rootDir.'/samples/Document1_foxitreader.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertFalse($page->isFpdf()); + $filename = $this->rootDir.'/samples/bugs/Issue454.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertTrue($page->isFpdf()); + } + + public function testGetPageNumber(): void + { + $filename = $this->rootDir.'/samples/Document1_foxitreader.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + 
$this->assertEquals(0, $page->getPageNumber()); + $filename = $this->rootDir.'/samples/Document1_pdfcreator.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertEquals(0, $page->getPageNumber()); + $filename = $this->rootDir.'/samples/Document2_pdfcreator_nocompressed.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertEquals(0, $page->getPageNumber()); + $filename = $this->rootDir.'/samples/InternationalChars.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertEquals(0, $page->getPageNumber()); + $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertEquals(0, $page->getPageNumber()); + $filename = $this->rootDir.'/samples/bugs/Issue454.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $this->assertEquals(0, $page->getPageNumber()); + $page = $pages[1]; + $this->assertEquals(1, $page->getPageNumber()); + $page = $pages[2]; + $this->assertEquals(2, $page->getPageNumber()); + $page = $pages[3]; + $this->assertEquals(3, $page->getPageNumber()); + } + + public function testIssue454(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue454.pdf'; + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + $this->assertIsArray($dataTm); + $this->assertGreaterThan(0, \count($dataTm)); + $this->assertIsArray($dataTm[0]); + $this->assertEquals(2, \count($dataTm[0])); + $this->assertIsArray($dataTm[0][0]); + $this->assertEquals(6, \count($dataTm[0][0])); + $this->assertEquals(201.96, round($dataTm[0][0][4], 2)); + $this->assertEquals(720.68, round($dataTm[0][0][5], 
2)); + $this->assertStringContainsString('Document title', $dataTm[0][1]); + $textData = $page->getTextXY(201.96, 720.68, 0.01, 0.01); + $this->assertStringContainsString('Document title', $textData[0][1]); + $page = $pages[2]; + $dataTm = $page->getDataTm(); + $this->assertIsArray($dataTm); + $this->assertGreaterThan(0, \count($dataTm)); + $this->assertIsArray($dataTm[0]); + $this->assertEquals(2, \count($dataTm[0])); + $this->assertIsArray($dataTm[0][0]); + $this->assertEquals(6, \count($dataTm[0][0])); + $this->assertEquals(67.5, $dataTm[0][0][4]); + $this->assertEquals(756.25, $dataTm[0][0][5]); + $this->assertStringContainsString('{signature:signer505906:Please+Sign+Here}', $dataTm[0][1]); + $textData = $page->getTextXY(67.5, 756.25); + $this->assertStringContainsString('{signature:signer505906:Please+Sign+Here}', $textData[0][1]); + } + + /** + * Check that BT and ET do not reset the font. + * + * Data TM font info is included. + * + * @see https://github.com/smalot/pdfparser/pull/630 + */ + public function testIssue629WithDataTmFontInfo(): void + { + $config = new Config(); + $config->setDataTmFontInfoHasToBeIncluded(true); + + $filename = $this->rootDir.'/samples/bugs/Issue629.pdf'; + $parser = $this->getParserInstance($config); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = end($pages); + $dataTm = $page->getDataTm(); + + $this->assertCount(4, $dataTm[0]); + $this->assertEquals('F2', $dataTm[0][2]); + } + + /** + * Data TM font info is NOT included. 
+ * + * @see https://github.com/smalot/pdfparser/pull/630 + */ + public function testIssue629WithoutDataTmFontInfo(): void + { + $config = new Config(); + + $filename = $this->rootDir.'/samples/bugs/Issue629.pdf'; + $parser = $this->getParserInstance($config); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = end($pages); + $dataTm = $page->getDataTm(); + + $this->assertCount(2, $dataTm[0]); + $this->assertFalse(isset($dataTm[0][2])); + } + + public function testCmCommandInPdfs(): void + { + $config = new Config(); + $parser = $this->getParserInstance($config); + $filename = $this->rootDir.'/samples/Document-Word-Landscape-printedaspdf.pdf'; + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + $page = $pages[0]; + $dataTm = $page->getDataTm(); + $item = $dataTm[2]; + $this->assertCount(6, $dataTm); + $this->assertCount(2, $item); + $this->assertCount(6, $item[0]); + $this->assertEquals('This is just a test', trim($item[1])); + $this->assertEquals( + [ + '0.75', + '0.0', + '0.0', + '0.75', + '59.16', + '500.4', + ], + [ + round($item[0][0], 6), + round($item[0][1], 6), + round($item[0][2], 6), + round($item[0][3], 6), + round($item[0][4], 2), + round($item[0][5], 2), + ] + ); + } +} diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php new file mode 100644 index 00000000..fb069c08 --- /dev/null +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -0,0 +1,106 @@ + + * + * @date 2024-04-19 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element\ElementArray; +use Smalot\PdfParser\Font; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Page; +use Smalot\PdfParser\Pages; + +/** + * @internal only for test purposes + */ +class PagesDummy extends Pages +{ + /** + * The purpose of this function is to bypass the tedious + * work to setup instances which lead to a valid $fonts variable. + * + * @param array<\Smalot\PdfParser\Font> $fonts + * + * @return void + */ + public function setFonts($fonts) + { + $this->fonts = $fonts; + } +} + +class PagesTest extends TestCase +{ + public function testFontsArePassedFromPagesToPage(): void + { + // Create mock Document, Font and Page objects + $document = $this->createMock(Document::class); + $font1 = new Font($document); + $page = new Page($document); + + // Create a Header object that indicates $page is a child + $header = new Header([ + 'Kids' => new ElementArray([ + $page, + ]), + ], $document); + + // Use this header to create a mock Pages object + $pages = new PagesDummy($document, $header); + + // Apply $font1 as a Font object to this Pages object; + // setFonts is used here as part of PagesDummy, only to access + // the protected Pages::fonts variable; it is not a method + // available in production + $pages->setFonts([$font1]); + + // Trigger setupFonts method in $pages + $pages->getPages(true); + + // Since the $page object font list is empty, $font1 from Pages + // object must be passed to the Page object + $this->assertEquals([$font1], 
$page->getFonts()); + + // Create a second $font2 using a different method + $font2 = $this->createMock(Font::class); + + // Update the fonts in $pages + $pages->setFonts([$font1, $font2]); + + // Trigger setupFonts method in $pages + $pages->getPages(true); + + // Now that $page already has a font, updates from $pages + // should not overwrite it + $this->assertEquals([$font1], $page->getFonts()); + } +} diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php new file mode 100644 index 00000000..fa0d3f42 --- /dev/null +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -0,0 +1,454 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Parser; +use Smalot\PdfParser\XObject\Image; + +class ParserTest extends TestCase +{ + protected function setUp(): void + { + parent::setUp(); + + $this->fixture = new Parser(); + } + + /** + * Notice: it may fail to run in Scrutinizer because of memory limitations. 
+ * + * @group memory-heavy + */ + public function testParseFile(): void + { + $directory = $this->rootDir.'/samples/bugs'; + + if (is_dir($directory)) { + $files = scandir($directory); + + foreach ($files as $file) { + if (preg_match('/^.*\.pdf$/i', $file)) { + try { + $document = $this->fixture->parseFile($directory.'/'.$file); + $pages = $document->getPages(); + $this->assertTrue(0 < \count($pages)); + + foreach ($pages as $page) { + $content = $page->getText(); + $this->assertTrue('' !== $content); + } + } catch (\Exception $e) { + if ( + 'Secured pdf file are currently not supported.' !== $e->getMessage() + && 0 != strpos($e->getMessage(), 'TCPDF_PARSER') + ) { + throw $e; + } + } + } + } + } + } + + /** + * Properly decode international unicode characters + * + * @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support + */ + public function testUnicodeDecoding(): void + { + $filename = $this->rootDir.'/samples/InternationalChars.pdf'; + + $document = $this->fixture->parseFile($filename); + + $testString_cyrillic = "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те."; + $testString_greek = "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. 
Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε."; + $testString_armenian = "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե"; + $testString_georgean = "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა"; + $testString_korean = "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다."; + $testString_western = 'ÄÖÜöäüßẞ Ññ¡¿ øÅå'; + + $this->assertStringContainsString($testString_cyrillic, $document->getText()); + $this->assertStringContainsString($testString_greek, $document->getText()); + $this->assertStringContainsString($testString_armenian, $document->getText()); + $this->assertStringContainsString($testString_georgean, $document->getText()); + $this->assertStringContainsString($testString_korean, $document->getText()); + $this->assertStringContainsString($testString_western, $document->getText()); + } + + /** + * Tests that xrefs with line breaks between id and position are parsed correctly + * + * @see https://github.com/smalot/pdfparser/issues/336 + */ + public function testIssue19(): void + { + $fixture = new ParserSub(); + $structure = [ + [ + '<<', + [ + [ + '/', + 'Type', + 7735, + ], + [ + '/', + 'ObjStm', + 7742, + ], + ], + ], + [ + 'stream', + '', + 7804, + [ + "17\n0", + [], + ], + ], + ]; + $document = new Document(); + + $fixture->exposedParseObject('19_0', $structure, $document); + $objects = $fixture->getObjects(); + + 
$this->assertArrayHasKey('17_0', $objects); + } + + /** + * Properly decode ANSI encodings without producing scrambled UTF-8 characters + * + * @see https://github.com/smalot/pdfparser/issues/202 + * @see https://github.com/smalot/pdfparser/pull/257 + */ + public function testIssue202(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue202.pdf'; + + $document = $this->fixture->parseFile($filename); + + $this->assertEquals('„fööbär“', $document->getText()); + } + + /** + * Test that issue related pdf can now be parsed + * + * @see https://github.com/smalot/pdfparser/issues/267 + */ + public function testIssue267(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf'; + + $document = $this->fixture->parseFile($filename); + + $this->assertEquals(Image::class, \get_class($document->getObjectById('128_0'))); + $this->assertStringContainsString('4 von 4', $document->getText()); + } + + /** + * Test that issue related pdf can now be parsed: + * Too many slashes were being stripped and resulted + * in malformed encoding of parts of the text content. + * + * @see https://github.com/smalot/pdfparser/issues/322 + */ + public function testIssue322(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue322.pdf'; + + $document = $this->fixture->parseFile($filename); + + $this->assertStringContainsString('this text isn’t working properly, I’ve edited it in Google Documents', $document->getText()); + } + + /** + * Test that issue related pdf can now be parsed: + * Too many slashes were being stripped and resulted + * in malformed encoding of parts of the text content. 
+ * + * License of the content taken from https://stackoverflow.com in the sample PDF: + * CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/ + * + * @see https://github.com/smalot/pdfparser/issues/334 + */ + public function testIssue334(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue334.pdf'; + + $document = $this->fixture->parseFile($filename); + + $this->assertStringContainsString('This question already has an answer here', $document->getText()); + } + + /** + * Test that issue related pdf can now be parsed: + * Glyphs not in the Postscript lookup table would cause "Notice: Undefined offset" + * + * @see https://github.com/smalot/pdfparser/issues/359 + */ + public function testIssue359(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue359.pdf'; + + $document = $this->fixture->parseFile($filename); + + $this->assertStringContainsString( + 'dnia 10 maja 2018 roku o ochronie danych osobowych', + $document->getText() + ); + $this->assertStringContainsString('sprawie ochrony osób fizycznych w związku', $document->getText()); + /* + * @todo Note that the "ł" in przepływu is decoded as a space character. This was already + * the case before the PR that caused this issue and is not currently covered by this + * test case. However, this issue should be addressed in the future and its fix can then + * be incorporated into this test by uncommenting the following assertion. + */ + // $this->assertStringContainsString('sprawie swobodnego przepływu takich danych oraz uchylenia dyrektywy', $document->getText()); + } + + /** + * Tests if PDF triggers "Call to undefined method Smalot\PdfParser\Header::__toString()". + * + * It happened because there was a check missing in Font.php (~ line 109). + * + * @see https://github.com/smalot/pdfparser/issues/391 + */ + public function testIssue391(): void + { + /** + * PDF provided by @dhildreth for usage in our test environment. 
+ * + * @see https://github.com/smalot/pdfparser/issues/391#issuecomment-783504599 + */ + $filename = $this->rootDir.'/samples/bugs/Issue391.pdf'; + + $document = $this->fixture->parseFile($filename); + + // check for an example string (PDF consists of many pages) + $this->assertStringContainsString( + '(This Code will be changed while mass production)', + $document->getText() + ); + } + + /** + * Tests if a PDF with null or empty string headers trigger an Exception. + * + * It happened because there was a check missing in Parser.php (parseHeaderElement function). + * + * @see https://github.com/smalot/pdfparser/issues/557 + */ + public function testIssue557(): void + { + /** + * PDF provided by @DogLoc for usage in our test environment. + * + * @see https://github.com/smalot/pdfparser/pull/560#issue-1461437944 + */ + $filename = $this->rootDir.'/samples/bugs/Issue557.pdf'; + + $document = $this->fixture->parseFile($filename); + + $this->assertStringContainsString( + 'Metal Face Inductive Sensor', + $document->getText() + ); + } + + /** + * Tests if an integer overflow triggers a TypeError in Font::uchr. + * + * @see https://github.com/smalot/pdfparser/issues/621 + */ + public function testIssue621(): void + { + $document = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue621.pdf'); + + $this->assertStringContainsString('What is a biological product?', $document->getText()); + } + + /** + * Tests behavior when changing default font space limit (-50). + * + * Test is based on testIssue359 (above). + */ + public function testChangedFontSpaceLimit(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue359.pdf'; + + $config = new Config(); + $config->setFontSpaceLimit(1); // change default value + + $this->fixture = new Parser([], $config); + $document = $this->fixture->parseFile($filename); + + $this->assertStringContainsString('dni a 10 maj a 2018', $document->getText()); + } + + /** + * Tests if a given Config object is really used. 
+ * Or if a default one is generated, if null was given. + */ + public function testUsageOfConfigObject(): void + { + // check default + $this->fixture = new Parser([]); + $this->assertEquals(new Config(), $this->fixture->getConfig()); + + // check default 2 + $this->fixture = new Parser([], null); + $this->assertEquals(new Config(), $this->fixture->getConfig()); + + // check given + $config = new Config(); + $config->setFontSpaceLimit(1000); + $this->fixture = new Parser([], $config); + $this->assertEquals($config, $this->fixture->getConfig()); + } + + /** + * Tests the impact of the retainImageContent config setting on memory usage + * + * @group memory-heavy + * + * @see https://github.com/smalot/pdfparser/issues/104#issuecomment-883422508 + */ + public function testRetainImageContentImpact(): void + { + if (version_compare(\PHP_VERSION, '7.3.0', '<')) { + $this->markTestSkipped('Garbage collection doesn\'t work reliably enough for this test in PHP < 7.3'); + } + + gc_collect_cycles(); + $baselineMemory = memory_get_usage(true); + + $filename = $this->rootDir.'/samples/bugs/Issue104a.pdf'; + $iterations = 2; + + /* + * check default (= true) + */ + $this->fixture = new Parser([]); + $this->assertTrue($this->fixture->getConfig()->getRetainImageContent()); + $document = null; + + for ($i = 0; $i < $iterations; ++$i) { + $document = $this->fixture->parseFile($filename); + } + + $usedMemory = memory_get_usage(true); + $this->assertGreaterThan($baselineMemory + 180000000, $usedMemory, 'Memory is only '.$usedMemory); + $this->assertTrue(null != $document && '' !== $document->getText()); + + // force garbage collection + $this->fixture = $document = null; + gc_collect_cycles(); + + /* + * check false + */ + $config = new Config(); + $config->setRetainImageContent(false); + $this->fixture = new Parser([], $config); + $this->assertEquals($config, $this->fixture->getConfig()); + + for ($i = 0; $i < $iterations; ++$i) { + $document = $this->fixture->parseFile($filename); + 
} + + $usedMemory = memory_get_usage(true); + /* + * note: the following memory value is set manually and may differ from system to system. + * it must be high enough to not produce a false negative though. + */ + $this->assertLessThan($baselineMemory * 1.05, $usedMemory, 'Memory is '.$usedMemory); + $this->assertTrue('' !== $document->getText()); + } + + /** + * Tests handling of encrypted PDF. + * + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function testNoIgnoreEncryption(): void + { + $filename = $this->rootDir.'/samples/not_really_encrypted.pdf'; + $threw = false; + try { + (new Parser([]))->parseFile($filename); + } catch (\Exception $e) { + // we expect an exception to be thrown if an encrypted PDF is encountered. + $threw = true; + } + $this->assertTrue($threw); + } + + /** + * Tests behavior if encryption is ignored. + * + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function testIgnoreEncryption(): void + { + $config = new Config(); + $config->setIgnoreEncryption(true); + + $filename = $this->rootDir.'/samples/not_really_encrypted.pdf'; + + $this->assertTrue((new Parser([], $config))->parseFile($filename) instanceof Document); + + // without the configuration option set, an exception would be thrown. + } +} + +class ParserSub extends Parser +{ + public function exposedParseObject($id, $structure, $document) + { + return $this->parseObject($id, $structure, $document); + } + + public function getObjects(): array + { + return $this->objects; + } +} diff --git a/tests/PHPUnit/Integration/RawData/FilterHelperTest.php b/tests/PHPUnit/Integration/RawData/FilterHelperTest.php new file mode 100644 index 00000000..c2f99e12 --- /dev/null +++ b/tests/PHPUnit/Integration/RawData/FilterHelperTest.php @@ -0,0 +1,211 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. 
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\RawData; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Parser; +use Smalot\PdfParser\RawData\FilterHelper; + +class FilterHelperTest extends TestCase +{ + protected function setUp(): void + { + parent::setUp(); + + $this->fixture = new FilterHelper(); + } + + /* + * Tests for filter ASCII85Decode + */ + + public function testDecodeFilterASCII85Decode(): void + { + $compressed = '6Z6g\Eb0<5ARlp)FE2)5B)'; // = Compressed string + $result = $this->fixture->decodeFilter('ASCII85Decode', $compressed); + + $this->assertEquals('Compressed string', $result); + } + + public function testDecodeFilterASCII85DecodeInitSequence(): void + { + $compressed = '<~6Z6g\Eb0<5ARlp)FE2)5B)'; // = Compressed string + $result = $this->fixture->decodeFilter('ASCII85Decode', $compressed); + + $this->assertEquals('Compressed string', $result); + } + + public function testDecodeFilterASCII85DecodeEndSequence(): void + { + $compressed = '6Z6g\Eb0<5ARlp)FE2)5B)~>'; // = Compressed string + $result = $this->fixture->decodeFilter('ASCII85Decode', $compressed); + + $this->assertEquals('Compressed string', $result); + } + + public function testDecodeFilterASCII85DecodeSpecificEndSequence(): void + { + $compressed = '+^6b<~>'; // = 0x215B33C0 = "![3\xC0" + $result = 
$this->fixture->decodeFilter('ASCII85Decode', $compressed); + + $this->assertEquals("\x21\x5B\x33\xC0", $result); + } + + /* + * Tests for filter ASCIIHexDecode + */ + + public function testDecodeFilterASCIIHexDecode(): void + { + $compressed = '43 6f 6d 70 72 65 73 73 65 64 20 73 74 72 69 6e 67'; // = Compressed string + $result = $this->fixture->decodeFilter('ASCIIHexDecode', $compressed); + + $this->assertEquals('Compressed string', $result); + } + + /* + * Tests for filter FlateDecode + */ + + public function testDecodeFilterFlateDecode(): void + { + $compressed = gzcompress('Compress me', 9); + $result = $this->fixture->decodeFilter('FlateDecode', $compressed); + + $this->assertEquals('Compress me', $result); + } + + /** + * How does function behave if an empty string was given. + */ + public function testDecodeFilterFlateDecodeEmptyString(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('decodeFilterFlateDecode: invalid data'); + + $this->fixture->decodeFilter('FlateDecode', ''); + } + + /** + * How does function behave if an uncompressed string was given. + */ + public function testDecodeFilterFlateDecodeUncompressedString(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('decodeFilterFlateDecode: invalid data'); + + $this->fixture->decodeFilter('FlateDecode', 'something'); + } + + /** + * How does function behave if compression checksum is CRC32 instead of Adler-32. + * See: https://github.com/smalot/pdfparser/issues/592 + */ + public function testDecodeFilterFlateDecodeCRC32Checksum(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue592.pdf'); + + self::assertStringContainsString('Two Westbrook Corporate Center Suite 500', $document->getText()); + } + + /** + * How does function behave if an unknown filter name was given. 
+ */ + public function testDecodeFilterUnknownFilter(): void + { + $result = $this->fixture->decodeFilter('a string '.rand(), 'something'); + $this->assertEquals('something', $result); + } + + /* + * Test for filters not being implemented yet. + */ + + /** + * CCITTFaxDecode + */ + public function testDecodeFilterCCITTFaxDecode(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Decode CCITTFaxDecode not implemented yet.'); + + $this->fixture->decodeFilter('CCITTFaxDecode', ''); + } + + /** + * Crypt + */ + public function testDecodeFilterCrypt(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Decode Crypt not implemented yet.'); + + $this->fixture->decodeFilter('Crypt', ''); + } + + /** + * DCTDecode + */ + public function testDecodeFilterDCTDecode(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Decode DCTDecode not implemented yet.'); + + $this->fixture->decodeFilter('DCTDecode', ''); + } + + /** + * JBIG2Decode + */ + public function testDecodeFilterJBIG2Decode(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Decode JBIG2Decode not implemented yet.'); + + $this->fixture->decodeFilter('JBIG2Decode', ''); + } + + /** + * JPXDecode + */ + public function testDecodeFilterJPXDecode(): void + { + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Decode JPXDecode not implemented yet.'); + + $this->fixture->decodeFilter('JPXDecode', ''); + } +} diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php new file mode 100644 index 00000000..515734c7 --- /dev/null +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -0,0 +1,318 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction 
oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration\RawData; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; +use Smalot\PdfParser\RawData\RawDataParser; + +class RawDataParserHelper extends RawDataParser +{ + /** + * Expose protected function "getRawObject". + */ + public function exposeGetRawObject($pdfData, $offset = 0) + { + return $this->getRawObject($pdfData, $offset); + } + + /** + * Expose protected function "getXrefData". + */ + public function exposeGetXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array + { + return $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets); + } + + /** + * Expose protected function "decodeXref". + */ + public function exposeDecodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array + { + return $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets); + } + + /** + * Expose protected function "decodeXrefStream". 
+ */ + public function exposeDecodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array + { + return $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets); + } +} + +class RawDataParserTest extends TestCase +{ + protected function setUp(): void + { + parent::setUp(); + + $this->fixture = new RawDataParserHelper([], new Config()); + } + + /** + * Tests buggy behavior of getRawObject. + * + * When PDF has corrupted xref table getRawObject may run into an infinite loop. + * + * @see https://github.com/smalot/pdfparser/issues/372 + * @see https://github.com/smalot/pdfparser/pull/377 + */ + public function testGetRawObjectIssue372(): void + { + // The following $data content is a minimal example to trigger the infinite loop + $data = '<>'; + + // calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop + // if the fix is not there. + $result = $this->fixture->exposeGetRawObject($data); + + $this->assertEquals( + [ + '<<', + [ + ['/', 'Producer', 11], + ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52], + ], + 52, + ], + $result + ); + + // Test that spaces after a 'stream' declaration are absorbed + // See: https://github.com/smalot/pdfparser/issues/641 + $data = 'stream '."\n"; + $data .= 'streamdata'."\n"; + $data .= 'endstream'."\n"; + $data .= 'endobj'; + + $result = $this->fixture->exposeGetRawObject($data); + + // Value 'streamdata'."\n" would be empty string without the fix + $this->assertEquals( + [ + 'stream', + 'streamdata'."\n", + 19, + ], + $result + ); + } + + /** + * Tests buggy behavior of decodeXrefStream. 
+ * + * @see https://github.com/smalot/pdfparser/issues/30 + * @see https://github.com/smalot/pdfparser/issues/192 + * @see https://github.com/smalot/pdfparser/issues/209 + * @see https://github.com/smalot/pdfparser/issues/330 + * @see https://github.com/smalot/pdfparser/issues/356 + * @see https://github.com/smalot/pdfparser/issues/373 + * @see https://github.com/smalot/pdfparser/issues/392 + * @see https://github.com/smalot/pdfparser/issues/397 + */ + public function testDecodeXrefStreamIssue356(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue356.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText()); + } + + public function testDecodeObjectHeaderIssue405(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue405.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText()); + } + + /** + * Tests buggy behavior of decodeXrefStream. + * + * When PDF has more than one entry in the /Index area (for example by changing + * the document description), only the first entry is used. + * If the fix is not used the array returned by getDetails() contains only the entry + * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title', + * 'Subject' (which come from the 'Info' object) are not listed, because the + * 'Info' object gets a wrong object id during parsing the data into the xref structure. + * So the object id listed at the /Info entry is not valid and the data of the info object + * cannot be loaded during executing Document::buildDetails(). 
+ * + * @see https://github.com/smalot/pdfparser/pull/479 + */ + public function testDecodeXrefStreamIssue479(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue479.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $details = $document->getDetails(); + + $this->assertArrayHasKey('Author', $details); + $this->assertArrayHasKey('CreationDate', $details); + $this->assertArrayHasKey('Creator', $details); + $this->assertArrayHasKey('ModDate', $details); + $this->assertArrayHasKey('Producer', $details); + $this->assertArrayHasKey('Subject', $details); + $this->assertArrayHasKey('Title', $details); + } + + /** + * Account for inaccurate offset values in getXrefData. + * + * Normally offset values extracted from the PDF document are exact. + * However in some cases, they may point to whitespace *before* a + * valid xref keyword. Move the offset forward past whitespace to + * make this function a little more lenient. + * + * @see https://github.com/smalot/pdfparser/issues/673 + */ + public function testGetXrefDataIssue673(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue673.pdf'; + + // Parsing this document would previously throw an Exception + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + self::assertStringContainsString('6 rue des Goutais', $text); + } + + /** + * Handle self referencing xref + * + * It seems that some PDF creators output `Prev 0` when there is no previous xref. 
+ * + * @see https://github.com/smalot/pdfparser/pull/727 + */ + public function testDecodeXrefIssue727(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue727.pdf'; + + // Parsing this document would previously cause an infinite loop + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + self::assertStringContainsString('', $text); + } + + /** + * Test that getXrefData prevents circular references + * + * When a PDF has circular references in xref chain (e.g., Prev pointing to already visited offset), + * the parser should detect this and stop recursion to prevent infinite loops. + */ + public function testGetXrefDataPreventsCircularReferences(): void + { + // Create a minimal PDF structure with xref that would create a circular reference + $pdfData = "%PDF-1.5\n"; + $pdfData .= "xref\n"; + $pdfData .= "0 1\n"; + $pdfData .= "0000000000 65535 f \n"; + $pdfData .= "trailer\n"; + $pdfData .= "<>\n"; // Prev points back to offset 7 (the xref keyword) + $pdfData .= "startxref\n"; + $pdfData .= "7\n"; + $pdfData .= "%%EOF\n"; + + // Test with visitedOffsets containing the offset we're trying to visit + $result = $this->fixture->exposeGetXrefData($pdfData, 7, [], [7]); + + // Should return empty xref array without recursing + $this->assertIsArray($result); + $this->assertEmpty($result); + } + + /** + * Test that decodeXref passes visitedOffsets correctly when handling Prev + * + * This ensures that circular reference detection works when decodeXref + * calls getXrefData for a Prev pointer. 
+ */ + public function testDecodeXrefPassesVisitedOffsets(): void + { + // Create a minimal xref structure with Prev + $pdfData = "xref\n"; + $pdfData .= "0 1\n"; + $pdfData .= "0000000000 65535 f \n"; + $pdfData .= "trailer\n"; + $pdfData .= "<>\n"; + + // Call decodeXref with visitedOffsets that includes the Prev offset + // This should not cause infinite recursion + $result = $this->fixture->exposeDecodeXref($pdfData, 0, [], [100]); + + // Should complete without error and return an array + $this->assertIsArray($result); + $this->assertArrayHasKey('trailer', $result); + } + + /** + * Test that getXrefData tracks visited offsets correctly + * + * Ensures that offsets are added to visitedOffsets array to prevent + * circular references in subsequent calls. + */ + public function testGetXrefDataTracksVisitedOffsets(): void + { + // Test that calling with an already-visited offset returns immediately + $pdfData = "%PDF-1.5\n"; + $pdfData .= "xref\n"; + $pdfData .= "0 1\n"; + $pdfData .= "0000000000 65535 f \n"; + $pdfData .= "trailer\n"; + $pdfData .= "<>\n"; + $pdfData .= "startxref\n"; + $pdfData .= "7\n"; + $pdfData .= "%%EOF\n"; + + // Call with offset 50 already in visitedOffsets - should return immediately + $result = $this->fixture->exposeGetXrefData($pdfData, 50, [], [50]); + + // Should return empty array without processing + $this->assertIsArray($result); + $this->assertEmpty($result); + } +} diff --git a/tests/PHPUnit/TestCase.php b/tests/PHPUnit/TestCase.php new file mode 100644 index 00000000..08d4739a --- /dev/null +++ b/tests/PHPUnit/TestCase.php @@ -0,0 +1,74 @@ + + * + * @date 2020-06-02 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. 
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests; + +use PHPUnit\Framework\TestCase as PHPTestCase; +use Smalot\PdfParser\Config; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element; +use Smalot\PdfParser\Parser; + +abstract class TestCase extends PHPTestCase +{ + /** + * Contains an instance of the class to test. + */ + protected $fixture; + + protected $rootDir; + + protected function setUp(): void + { + parent::setUp(); + + $this->rootDir = __DIR__.'/../..'; + } + + protected function getDocumentInstance(): Document + { + return new Document(); + } + + protected function getElementInstance($value): Element + { + return new Element($value); + } + + protected function getParserInstance(?Config $config = null): Parser + { + return new Parser([], $config); + } +} diff --git a/tests/PHPUnit/Unit/ConfigTest.php b/tests/PHPUnit/Unit/ConfigTest.php new file mode 100644 index 00000000..133c7eb1 --- /dev/null +++ b/tests/PHPUnit/Unit/ConfigTest.php @@ -0,0 +1,78 @@ + + * + * @date 2020-12-15 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. 
+ * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Unit; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; + +class ConfigTest extends TestCase +{ + protected function setUp(): void + { + parent::setUp(); + + $this->fixture = new Config(); + } + + /** + * Tests setter and getter for font space limit. + */ + public function testFontSpaceLimitSetterGetter(): void + { + $this->assertEquals(-50, $this->fixture->getFontSpaceLimit()); + + $this->fixture->setFontSpaceLimit(1); + $this->assertEquals(1, $this->fixture->getFontSpaceLimit()); + } + + /** + * Tests setter and getter for horizontal offset. + */ + public function testHorizontalOffsetSetterGetter() + { + $this->assertEquals(' ', $this->fixture->getHorizontalOffset()); + + $this->fixture->setHorizontalOffset(' '); + $this->assertEquals(' ', $this->fixture->getHorizontalOffset()); + } + + /** + * Tests setter and getter for retaining of raw image data. 
+ */ + public function testRetainImageContentSetterGetter(): void + { + $this->assertTrue($this->fixture->getRetainImageContent()); + + $this->fixture->setRetainImageContent(false); + $this->assertFalse($this->fixture->getRetainImageContent()); + } +} diff --git a/tests/PHPUnit/Unit/DocumentTest.php b/tests/PHPUnit/Unit/DocumentTest.php new file mode 100644 index 00000000..e28ee9c7 --- /dev/null +++ b/tests/PHPUnit/Unit/DocumentTest.php @@ -0,0 +1,18 @@ +assertNull($document->getFirstFont()); + } +} diff --git a/tests/PHPUnit/Unit/EncodingTest.php b/tests/PHPUnit/Unit/EncodingTest.php new file mode 100644 index 00000000..7116b5dc --- /dev/null +++ b/tests/PHPUnit/Unit/EncodingTest.php @@ -0,0 +1,19 @@ +assertNull($encoding->translateChar('foo')); + } +} diff --git a/tests/PHPUnit/Unit/FontTest.php b/tests/PHPUnit/Unit/FontTest.php new file mode 100644 index 00000000..f60818ff --- /dev/null +++ b/tests/PHPUnit/Unit/FontTest.php @@ -0,0 +1,71 @@ + + * + * @date 2023-07-19 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . 
+ */ + +namespace PHPUnitTests\Unit; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Font; +use Smalot\PdfParser\PDFObject; + +class FontTest extends TestCase +{ + /** + * decodeText must decode \b. + * + * @see https://github.com/smalot/pdfparser/pull/597 + */ + public function testDecodeTextIssue597(): void + { + $config = $this->createMock(Config::class); + $config->method('getFontSpaceLimit')->willReturn(1); + + $document = $this->createMock(Document::class); + $sut = new Font($document, null, null, $config); + + $commands = [ + [ + PDFObject::TYPE => '<', + PDFObject::COMMAND => "\b", + ], + ]; + + // result is a binary string and looks like: 0x3cc2ab083e + $result = $sut->decodeText($commands); + + // check that \b is not part of the result anymore + self::assertFalse(strpos($result, "\b>")); + + // compare result with expected value + self::assertEquals('3cc2ab083e', bin2hex($result)); + } +} diff --git a/tests/PHPUnit/Unit/PDFObjectTest.php b/tests/PHPUnit/Unit/PDFObjectTest.php new file mode 100644 index 00000000..4224318a --- /dev/null +++ b/tests/PHPUnit/Unit/PDFObjectTest.php @@ -0,0 +1,79 @@ +getText()); + } + + public function testGetTextOnPageWithoutContent(): void + { + $document = new Document(); + + static::assertSame(' ', (new PDFObject($document, null, null))->getText(new Page($document))); + } + + public function testTextArrayObjects(): void + { + $document = new Document(); + $document->init(); + + $image = new Image($document); + $form = new Form($document); + $xObject = new PDFObject($document); + + $header1 = new Header([ + 'Resources' => new Header([ + 'XObject' => new Header([ + 'Im0' => $image, + ]) + ]), + 'Contents' => new ElementArray([new Element('/Im0 Do', $document)], $document), + ]); + $page1 = new Page($document, $header1); + + $header2 = new Header([ + 'Resources' => new Header([ + 'XObject' => new Header([ + 'Fr0' => $form, + ]) + ]), + 'Contents' => new 
ElementArray([new Element('/Fr0 Do', $document)], $document), + ]); + $page2 = new Page($document, $header2); + + $header3 = new Header([ + 'Resources' => new Header([ + 'XObject' => new Header([ + 'Ps0' => $xObject, + ]) + ]), + 'Contents' => new ElementArray([new Element('/Ps0 Do', $document)], $document), + ]); + $page3 = new Page($document, $header3); + + // Page 1 contains an image, which should not appear in the text array. + self::assertSame([], $page1->getTextArray()); + + // Page 2 contains a form, which should not appear in the text array. + self::assertSame([], $page2->getTextArray()); + + // Page 3 contains a non-image object, which should appear in the text array. + self::assertSame([' '], $page3->getTextArray()); + } +} diff --git a/tests/Performance/AbstractPerformanceTest.php b/tests/Performance/AbstractPerformanceTest.php new file mode 100644 index 00000000..f14f1ce1 --- /dev/null +++ b/tests/Performance/AbstractPerformanceTest.php @@ -0,0 +1,21 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . 
+ */ + +namespace PerformanceTests\Test; + +use PerformanceTests\AbstractPerformanceTest; +use Smalot\PdfParser\Parser; + +/** + * This test checks does a performance test with certain PDF files that extensively use + * the getFirstFont() method of Document.php. If Document.php correctly uses a dictionary + * to cache the objects inside the PDF file, then the parsing should be quick. + * If it does not, the parsing can be extensively slow or even crash. + */ +class DocumentDictionaryCacheTest extends AbstractPerformanceTest +{ + /** + * @var Parser + */ + protected $parser; + protected $data; + + public function init(): void + { + $this->parser = new Parser(); + + // load PDF file content + $this->data = file_get_contents(__DIR__.'/../../../samples/DocumentWithLotsOfObjects.pdf'); + } + + public function run(): void + { + // give PDF content to function and parse it + $pdf = $this->parser->parseContent($this->data); + + $pages = $pdf->getPages(); + + foreach ($pages as $i => $page) { /* @var $page Page */ + if ($i < 77) { + continue; + } + if ($i > 78) { + continue; + } + + $page->getText(); // Test this method + } + } + + public function getMaxEstimatedTime(): int + { + return 20; + } +} diff --git a/tests/Performance/runPerformanceTests.php b/tests/Performance/runPerformanceTests.php new file mode 100644 index 00000000..fa983105 --- /dev/null +++ b/tests/Performance/runPerformanceTests.php @@ -0,0 +1,31 @@ +init(); + + $startTime = microtime(true); + $test->run(); + $endTime = microtime(true); + + $time = $endTime - $startTime; + + if ($test->getMaxEstimatedTime() <= $time) { + $msg = sprintf( + 'Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', + get_class($test), + $time, + $test->getMaxEstimatedTime() + ); + + throw new PerformanceFailException($msg); + } +}