Merge pull request #6 from johnjor/image-data-src

Images with base64 encoded data src
johnjor · Dec 4, 2024 · 14e0f43 · 14e0f43
2 parents f5f2612 + 69131e9
commit 14e0f43
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -16,8 +16,11 @@ PyPI: https://pypi.org/project/html3docx/
 - Images with a width attribute will be scaled according to that width.
 - Fix for AttributeError when handling a leading br tag, either at the top of the HTML snippet, or within a td or th cell.
 - Fix for IndexError when a table has more cells in latter rows than in the first row.
+- Ordered lists will now restart at 1 when preceded by a paragraph that is not a numbered list.
 - Parameterized image fetcher function.
+- Parameterized default styles for OL, UL, and TABLE tags.
 - Fix for KeyError when handling an anchor with no href attribute.
+- Added support for images whose src is a base64-encoded data URI.
 
 ## Original README
 

diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
@@ -12,15 +12,18 @@
 
 How to deal with block level style applied over table elements? e.g. text align
 """
+import binascii
 import re, argparse
 import io, os
 import urllib.request
+from base64 import b64decode
 from urllib.parse import urlparse
 from html.parser import HTMLParser
 from typing import Callable
 
 import docx, docx.table
 from docx import Document
+from docx.image.exceptions import UnrecognizedImageError
 from docx.shared import RGBColor, Inches
 from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH
 from docx.image.image import Image
@@ -41,6 +44,8 @@
 # Style to use with paragraphs. By default no style is used.
 DEFAULT_PARAGRAPH_STYLE = None
 
+RE_DATA_SRC = re.compile(r"data:([\w\d/-]+)?;?(base64)?,(.+)")
+
 
 def get_filename_from_url(url):
     return os.path.basename(urlparse(url).path)
@@ -55,6 +60,14 @@ def is_url(url):
     return all([parts.scheme, parts.netloc, parts.path])
 
 
+def parse_data_src(src):
+    result = re.match(RE_DATA_SRC, src)
+    media_type = result.group(1) or "text/plain"
+    is_base64 = result.group(2) == "base64"
+    data = result.group(3)
+    return media_type, is_base64, data
+
+
 def fetch_image(url):
     """
     Attempts to fetch an image from a url. 
@@ -359,7 +372,17 @@ def handle_img(self, current_attrs):
             except urllib.error.URLError:
                 image = None
         else:
-            image = src
+            if src.startswith("data:"):
+                _, is_base64, data = parse_data_src(src)
+                if is_base64:
+                    try:
+                        image = io.BytesIO(b64decode(data))
+                    except binascii.Error:
+                        image = None
+                else:
+                    image = data  # TODO: URL-decode, but why would an image not be base64
+            else:
+                image = None
         # add image to doc
         if image:
             try:
@@ -378,7 +401,7 @@ def handle_img(self, current_attrs):
                         self.doc.add_picture(image)
                 else:
                     self.add_image_to_cell(self.doc, image)
-            except FileNotFoundError:
+            except (FileNotFoundError, UnrecognizedImageError):
                 image = None
         if not image:
             if src_is_url:
@@ -602,9 +625,9 @@ def handle_data(self, data):
         # https://html.spec.whatwg.org/#interactive-content
         link = self.tags.get('a')
         if link:
-            self.handle_link(link.get("href", ""), data)
+            self.handle_link(link.get("href", ""), data)  # TODO: IDE says link is a list, but .get() works fine
         else:
-            # If there's a link, dont put the data directly in the run
+            # If there's a link, don't put the data directly in the run
             self.run = self.paragraph.add_run(data)
             spans = self.tags['span']
             for span in spans:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "html3docx"
-version = "1.0.0"
+version = "1.0.1"
 authors = [
   {name="John Jordan"},
 ]

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
-beautifulsoup4==4.8.0
-python-docx==0.8.10
+beautifulsoup4>=4.8.0,<5
+python-docx>=0.8.10
diff --git a/tests/test.py b/tests/test.py
@@ -5,6 +5,7 @@
 import unittest
 from docx import Document
 from .context import HtmlToDocx, test_dir
+from htmldocx.h2d import parse_data_src
 
 
 class OutputTest(unittest.TestCase):
@@ -234,6 +235,32 @@ def test_image_sizes(self):
             "<img src='https://upload.wikimedia.org/wikipedia/commons/8/88/A_storm_at_Pors-Loubous.jpg' />",
             self.document)
 
+    def test_image_data_src(self):
+        self.document.add_heading("An image using a data src", level=1)
+        self.parser.add_html_to_document(
+            '<img alt="" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" style="width:36pt;height:36pt" />',
+            self.document
+        )
+
+        for p in self.document.paragraphs:
+            assert "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" not in p.text, p.text
+
+
+    def test_image_data_src_in_table(self):
+        self.document.add_heading("An image using a data src in a table", level=1)
+        self.parser.add_html_to_document(
+            """
+            <table><tbody><tr><td>
+            <img alt="" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" style="width:36pt;height:36pt" />
+            </td></tr></tbody></table>
+            """,
+            self.document
+        )
+
+        for p in self.document.paragraphs:
+            assert "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" not in p.text, p.text
+
+
     def test_image_no_src(self):
         self.document.add_heading(
             'Test: Handling IMG without SRC',
@@ -318,5 +345,35 @@ def custom_fetcher(url):
             "</td></tr></tbody></table>", self.document)
 
 
+class UnitTest(unittest.TestCase):
+    @staticmethod
+    def test_parse_data_src1():
+        media_type, is_base64, data = parse_data_src("data:,Hello%2C%20World%21")
+        assert media_type == "text/plain"
+        assert is_base64 == False
+        assert data == "Hello%2C%20World%21"
+
+    @staticmethod
+    def test_parse_data_src2():
+        media_type, is_base64, data = parse_data_src("data:text/html,%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E")
+        assert media_type == "text/html"
+        assert is_base64 == False
+        assert data == "%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E"
+
+    @staticmethod
+    def test_parse_data_src3():
+        media_type, is_base64, data = parse_data_src("data:;base64,SGVsbG8sIFdvcmxkIQ==")
+        assert media_type == "text/plain"
+        assert is_base64 == True
+        assert data == "SGVsbG8sIFdvcmxkIQ=="
+
+    @staticmethod
+    def test_parse_data_src4():
+        media_type, is_base64, data = parse_data_src("data:image/png;base64,iVBORw0KGgoAAA")
+        assert media_type == "image/png"
+        assert is_base64 == True
+        assert data == "iVBORw0KGgoAAA"
+
+
 if __name__ == '__main__':
     unittest.main()