Skip to content

Commit

Permalink
Merge pull request #6 from johnjor/image-data-src
Browse files Browse the repository at this point in the history
Images with base64 encoded data src
  • Loading branch information
johnjor authored Dec 4, 2024
2 parents f5f2612 + 69131e9 commit 14e0f43
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 7 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ PyPI: https://pypi.org/project/html3docx/
- Images with a width attribute will be scaled according to that width.
- Fix for AttributeError when handling a leading br tag, either at the top of the HTML snippet, or within a td or th cell.
- Fix for IndexError when a table has more cells in latter rows than in the first row.
- Ordered lists will now restart at 1. when preceded by a paragraph that is not a numbered list.
- Parameterized image fetcher function.
- Parameterized default styles for OL, UL, and TABLE tags.
- Fix for KeyError when handling an anchor with no href attribute.
- Added support for images with base64 encoded data src.

## Original README

Expand Down
31 changes: 27 additions & 4 deletions htmldocx/h2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,18 @@
How to deal with block level style applied over table elements? e.g. text align
"""
import binascii
import re, argparse
import io, os
import urllib.request
from base64 import b64decode
from urllib.parse import urlparse
from html.parser import HTMLParser
from typing import Callable

import docx, docx.table
from docx import Document
from docx.image.exceptions import UnrecognizedImageError
from docx.shared import RGBColor, Inches
from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH
from docx.image.image import Image
Expand All @@ -41,6 +44,8 @@
# Style to use with paragraphs. By default no style is used.
DEFAULT_PARAGRAPH_STYLE = None

# Matches a ``data:`` URI (RFC 2397) from the start of the string.
#   group 1: optional media type, e.g. "image/png" or "image/svg+xml"
#            ("+" and "." are legal in subtype names)
#   (uncaptured): any number of ";key=value" parameters, e.g. ";charset=utf-8"
#   group 2: the literal "base64" when the payload is base64 encoded
#   group 3: the payload after the first comma (at least one character)
# re.DOTALL lets the payload span line breaks, which are common when long
# base64 strings are wrapped inside an HTML attribute.
RE_DATA_SRC = re.compile(
    r"data:([\w.+/-]+)?(?:;[\w-]+=[^;,]*)*;?(base64)?,(.+)",
    re.DOTALL,
)


def get_filename_from_url(url):
    """Return the final path component of *url*.

    Only the path portion of the URL is considered, so any query string or
    fragment is ignored. A path ending in "/" yields an empty string.
    """
    url_path = urlparse(url).path
    _, filename = os.path.split(url_path)
    return filename
Expand All @@ -55,6 +60,14 @@ def is_url(url):
return all([parts.scheme, parts.netloc, parts.path])


def parse_data_src(src):
    """Split a ``data:`` URI into its media type, base64 flag, and payload.

    The expected layout (RFC 2397) is::

        data:[<media type>][;<param>=<value>...][;base64],<payload>

    :param src: the value of an ``src`` attribute, e.g.
        ``"data:image/png;base64,iVBORw0..."``.
    :return: a ``(media_type, is_base64, data)`` tuple. ``media_type`` is the
        bare type/subtype without parameters and defaults to ``"text/plain"``
        when omitted. ``is_base64`` is True when the last header token is
        ``base64`` (compared case-insensitively). ``data`` is the still-encoded
        payload after the first comma.

    Input that is not a well-formed data URI (wrong scheme or no comma) does
    not raise. It yields ``("text/plain", False, "")`` so that callers can
    treat the empty payload as "nothing to load".
    """
    default_media_type = "text/plain"

    scheme, _, remainder = src.partition(":")
    # Only the first comma separates header from payload; the payload itself
    # may legitimately contain further commas.
    header, comma, data = remainder.partition(",")
    if scheme != "data" or not comma:
        return default_media_type, False, ""

    # header looks like "image/png;charset=utf-8;base64": the first token is
    # the media type, everything after it is a parameter.
    media_type, *params = (token.strip() for token in header.split(";"))
    is_base64 = bool(params) and params[-1].lower() == "base64"
    return media_type or default_media_type, is_base64, data


def fetch_image(url):
"""
Attempts to fetch an image from a url.
Expand Down Expand Up @@ -359,7 +372,17 @@ def handle_img(self, current_attrs):
except urllib.error.URLError:
image = None
else:
image = src
if src.startswith("data:"):
_, is_base64, data = parse_data_src(src)
if is_base64:
try:
image = io.BytesIO(b64decode(data))
except binascii.Error:
image = None
else:
image = data # TODO: URL-decode, but why would an image not be base64
else:
image = None
# add image to doc
if image:
try:
Expand All @@ -378,7 +401,7 @@ def handle_img(self, current_attrs):
self.doc.add_picture(image)
else:
self.add_image_to_cell(self.doc, image)
except FileNotFoundError:
except (FileNotFoundError, UnrecognizedImageError):
image = None
if not image:
if src_is_url:
Expand Down Expand Up @@ -602,9 +625,9 @@ def handle_data(self, data):
# https://html.spec.whatwg.org/#interactive-content
link = self.tags.get('a')
if link:
self.handle_link(link.get("href", ""), data)
self.handle_link(link.get("href", ""), data) # TODO: IDE says link is a list, but .get() works fine
else:
# If there's a link, dont put the data directly in the run
# If there's a link, don't put the data directly in the run
self.run = self.paragraph.add_run(data)
spans = self.tags['span']
for span in spans:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "html3docx"
version = "1.0.0"
version = "1.0.1"
authors = [
{name="John Jordan"},
]
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
beautifulsoup4==4.8.0
python-docx==0.8.10
beautifulsoup4>=4.8.0,<5
python-docx>=0.8.10
57 changes: 57 additions & 0 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import unittest
from docx import Document
from .context import HtmlToDocx, test_dir
from htmldocx.h2d import parse_data_src


class OutputTest(unittest.TestCase):
Expand Down Expand Up @@ -234,6 +235,32 @@ def test_image_sizes(self):
"<img src='https://upload.wikimedia.org/wikipedia/commons/8/88/A_storm_at_Pors-Loubous.jpg' />",
self.document)

def test_image_data_src(self):
    """An <img> with a base64 data src is embedded as a picture.

    Checks both that a new inline shape appears in the document and that
    the raw base64 payload has not leaked into any paragraph as text.
    """
    encoded_png = (
        "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/"
        "w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
    )
    html = (
        '<img alt="" src="data:image/png;base64,' + encoded_png
        + '" style="width:36pt;height:36pt" />'
    )

    self.document.add_heading("An image using a data src", level=1)
    # Compare against the count before parsing rather than an absolute
    # number, in case the document already holds pictures.
    shapes_before = len(self.document.inline_shapes)
    self.parser.add_html_to_document(html, self.document)

    self.assertGreater(len(self.document.inline_shapes), shapes_before)
    for paragraph in self.document.paragraphs:
        self.assertNotIn(encoded_png, paragraph.text)


def test_image_data_src_in_table(self):
    """An <img> with a base64 data src inside a table cell is embedded.

    ``Document.paragraphs`` only lists body-level paragraphs, so the
    paragraphs of every table cell are inspected as well; otherwise the
    cell that receives the image would never be checked.
    """
    encoded_png = (
        "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/"
        "w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
    )
    html = (
        """
        <table><tbody><tr><td>
            <img alt="" src="data:image/png;base64,"""
        + encoded_png
        + """" style="width:36pt;height:36pt" />
        </td></tr></tbody></table>
        """
    )

    self.document.add_heading("An image using a data src in a table", level=1)
    # Compare against the count before parsing rather than an absolute
    # number, in case the document already holds pictures.
    shapes_before = len(self.document.inline_shapes)
    self.parser.add_html_to_document(html, self.document)

    self.assertGreater(len(self.document.inline_shapes), shapes_before)

    paragraphs = list(self.document.paragraphs)
    for table in self.document.tables:
        for row in table.rows:
            for cell in row.cells:
                paragraphs.extend(cell.paragraphs)
    for paragraph in paragraphs:
        self.assertNotIn(encoded_png, paragraph.text)


def test_image_no_src(self):
self.document.add_heading(
'Test: Handling IMG without SRC',
Expand Down Expand Up @@ -318,5 +345,35 @@ def custom_fetcher(url):
"</td></tr></tbody></table>", self.document)


class UnitTest(unittest.TestCase):
    """Unit tests for ``parse_data_src`` covering the data URI header forms."""

    def test_parse_data_src1(self):
        """No media type and no encoding: defaults to text/plain, not base64."""
        media_type, is_base64, data = parse_data_src("data:,Hello%2C%20World%21")
        self.assertEqual(media_type, "text/plain")
        self.assertIs(is_base64, False)
        self.assertEqual(data, "Hello%2C%20World%21")

    def test_parse_data_src2(self):
        """Explicit media type with a URL-encoded (non-base64) payload."""
        media_type, is_base64, data = parse_data_src(
            "data:text/html,%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E"
        )
        self.assertEqual(media_type, "text/html")
        self.assertIs(is_base64, False)
        self.assertEqual(data, "%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E")

    def test_parse_data_src3(self):
        """Base64 payload with the media type omitted."""
        media_type, is_base64, data = parse_data_src("data:;base64,SGVsbG8sIFdvcmxkIQ==")
        self.assertEqual(media_type, "text/plain")
        self.assertIs(is_base64, True)
        self.assertEqual(data, "SGVsbG8sIFdvcmxkIQ==")

    def test_parse_data_src4(self):
        """Base64 payload with an explicit image media type."""
        media_type, is_base64, data = parse_data_src("data:image/png;base64,iVBORw0KGgoAAA")
        self.assertEqual(media_type, "image/png")
        self.assertIs(is_base64, True)
        self.assertEqual(data, "iVBORw0KGgoAAA")


if __name__ == '__main__':
unittest.main()

0 comments on commit 14e0f43

Please sign in to comment.