Skip to content

Commit

Permalink
Fix for two bugs related to Unicode translation support by Font objects
Browse files Browse the repository at this point in the history
Symptom was that some documents' contents were rendering as a bunch of
control characters.  These are the untranslated strings.  This was
happening because, for two different reasons, these strings weren't being
translated by \Smalot\PdfParser\Font::decodeContent() in some circumstances.

First fix is to \Smalot\PdfParser\Font::loadTranslateTable():

  - Fixed bug where bfchar sections weren't loaded due to a mistake in the regexp.
  - It now uses `*` instead of `+` and thus supports translation tables with
    lines like `<0000><0000>`.  (Required `<0000> <0000>` before.)

Second fix is for documents that attach their Font objects to the Pages
object instead of each Page object:

  - \Smalot\PdfParser\Page now has a setFonts() method
  - \Smalot\PdfParser\Pages now declares its $fonts variable
  - \Smalot\PdfParser\Pages::getPages() now applies the object's fonts to each child Page
  - \Smalot\PdfParser\Pages::getFonts() copied from Page class
  • Loading branch information
unixnut committed Jan 19, 2024
1 parent 268a620 commit cb1a70d
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Font.php
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ public function loadTranslateTable(): array
// Support for multiple bfchar sections
if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $matches)) {
foreach ($matches['sections'] as $section) {
$regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

preg_match_all($regexp, $section, $matches);

Expand Down
7 changes: 7 additions & 0 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ class Page extends PDFObject
*/
protected $dataTm;

/**
 * Assign a font table to this page, unless the page already holds a non-empty one.
 *
 * @param Font[] $fonts font table to fall back on (presumably the one collected
 *                      by the parent Pages object; verify against callers)
 */
public function setFonts($fonts)
{
    // A font table that is already populated on the page takes precedence.
    if (!empty($this->fonts)) {
        return;
    }

    $this->fonts = $fonts;
}

/**
* @return Font[]
*/
Expand Down
53 changes: 53 additions & 0 deletions src/Smalot/PdfParser/Pages.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
*/
class Pages extends PDFObject
{
/**
* @var Font[]
*/
protected $fonts;

/**
* @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them,
*
Expand All @@ -57,17 +62,65 @@ public function getPages(bool $deep = false): array
return $kidsElement->getContent();
}

// Prepare to apply this Pages object's fonts to each child page
$fonts = $this->getFonts();

$kids = $kidsElement->getContent();
$pages = [];

foreach ($kids as $kid) {
if ($kid instanceof self) {
$pages = array_merge($pages, $kid->getPages(true));
} elseif ($kid instanceof Page) {
if (!empty($this->fonts)) {
$kid->setFonts($fonts);
}
$pages[] = $kid;
}
}

return $pages;
}

/**
* @return Font[]
*/
public function getFonts()
{
    // Serve the memoised table when an earlier call has already built it.
    if (null !== $this->fonts) {
        return $this->fonts;
    }

    $resources = $this->get('Resources');

    // Nothing to collect unless the resources element offers has() and lists a Font entry.
    // This early exit deliberately leaves $this->fonts untouched (no caching).
    if (!method_exists($resources, 'has') || !$resources->has('Font')) {
        return [];
    }

    $fontEntry = $resources->get('Font');

    if ($fontEntry instanceof ElementMissing) {
        return [];
    }

    // The Font entry is either a Header itself or an object that wraps one.
    $elements = $fontEntry instanceof Header
        ? $fontEntry->getElements()
        : $fontEntry->getHeader()->getElements();

    $table = [];

    foreach ($elements as $id => $font) {
        // Entries that are not Font instances are ignored.
        if (!$font instanceof Font) {
            continue;
        }

        $table[$id] = $font;

        // Store too on cleaned id value (digits, '.', '-' and '_' only)
        $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
        if ('' != $cleanedId) {
            $table[$cleanedId] = $font;
        }
    }

    // Remember the table so later calls return it directly.
    $this->fonts = $table;

    return $table;
}
}

0 comments on commit cb1a70d

Please sign in to comment.