Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic table support #55

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions html2docx/html2docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def _reset(self) -> None:

# Formatting options
self.pre = False
self.table_cell: Optional[Any] = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.table_cell: Optional[Any] = None
self.table_cell: Optional[_Cell] = None

self.tables: List[Tuple[Any, int, int]] = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.tables: List[Tuple[Any, int, int]] = []
self.tables: List[Tuple[Table, int, int]] = []

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the table is only ever appended to, we could keep track of just the column index and always use the last row. No big deal either way.

self.alignment: Optional[int] = None
self.padding_left: Optional[Pt] = None
self.attrs: List[List[Tuple[str, Any]]] = []
Expand All @@ -101,6 +103,38 @@ def finish_p(self) -> None:
self.r.text = self.r.text.rstrip()
self._reset()

def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
if self.table_cell is not None:
table = self.table_cell.add_table(rows=0, cols=0)
else:
table = self.doc.add_table(rows=0, cols=0)
Comment on lines +107 to +110
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if self.table_cell is not None:
table = self.table_cell.add_table(rows=0, cols=0)
else:
table = self.doc.add_table(rows=0, cols=0)
container = self.doc if self.table_cell is None else self.table_cell
table = container.add_table(rows=0, cols=0)

self.tables.append((table, -1, -1))

def finish_table(self) -> None:
    """Close the innermost open table and spread its columns evenly.

    Pops the most recent entry from ``self.tables`` and assigns every column
    of that table the same width. The available width is the first section's
    page width minus its left and right margins, halved once for each table
    that is still open (i.e. once per nesting level).

    Widths are assigned here rather than when a column is created because
    the final column count is needed to divide the width evenly.

    A ``</table>`` with no matching open table is ignored instead of raising
    ``IndexError``.
    """
    if not self.tables:
        # Stray closing tag in malformed HTML: there is nothing to finish.
        return

    table = self.tables.pop()[0]
    column_count = len(table.columns)
    if not column_count:
        # No cells were ever added, so there is no column to size
        # (and nothing to divide the width by).
        return

    section = self.doc.sections[0]
    usable_width = section.page_width - section.left_margin - section.right_margin
    # After the pop, len(self.tables) is the number of enclosing tables that
    # are still open; each nesting level gets half of its parent's width.
    usable_width = int(usable_width * (0.5 ** len(self.tables)))

    column_width = usable_width // column_count
    for col in table.columns:
        col.width = column_width

def init_tr(self) -> None:
    """Start a new row in the innermost open table.

    Advances the row index stored for the table on top of ``self.tables``,
    resets its column index to ``-1`` (no cell opened yet in this row), and
    appends a row to the table.

    A ``<tr>`` that appears outside any ``<table>`` is ignored instead of
    raising ``IndexError``.
    """
    if not self.tables:
        # Row tag in malformed HTML with no open table to attach it to.
        return

    table, row, _ = self.tables[-1]
    self.tables[-1] = (table, row + 1, -1)
    table.add_row()

def init_tdth(self) -> None:
table, row, col = self.tables[-1]
col += 1
self.tables[-1] = (table, row, col)
if col >= len(table.columns):
table.add_column(0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment explaining that columns must have a width to form a valid docx would be very helpful. It could also explain that the column width is computed in finish_table, because the total column count is needed to evenly spread the width.

self.table_cell = table.cell(row, col)
self.p = self.table_cell.paragraphs[0]
self.r = None

def init_run(self, attrs: List[Tuple[str, Any]]) -> None:
self.attrs.append(attrs)
if attrs:
Expand Down Expand Up @@ -183,6 +217,12 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N
self.init_run([("underline", True)])
elif tag == "ul":
self.add_list_style("List Bullet")
elif tag == "table":
self.init_table(attrs)
elif tag == "tr":
self.init_tr()
elif tag in ["td", "th"]:
self.init_tdth()

def handle_data(self, data: str) -> None:
if not self.pre:
Expand All @@ -208,3 +248,9 @@ def handle_endtag(self, tag: str) -> None:
del self.list_style[-1]
elif tag == "pre":
self.pre = False
elif tag == "table":
self.finish_table()
elif tag in ["td", "th"]:
self.table_cell = None
self.p = None
self.r = None
4 changes: 4 additions & 0 deletions tests/table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<table>
<tr><td>1</td><td><b>2</b></td></tr>
<tr><td>3</td></tr>
</table>
7 changes: 7 additions & 0 deletions tests/table_nested.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<table>
<tr><td>1</td><td>2</td></tr>
<tr><td>3</td><td><table>
<tr><th>4</th><th>5</th></tr>
<tr><td>6</td><td>7</td></tr>
</table></td></tr>
</table>
70 changes: 70 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os

import docx

from html2docx import html2docx

from .utils import TEST_DIR


def test_table():
html_path = os.path.join(TEST_DIR, "table.html")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can test_html2docx be extended to accept tables in the spec?

The specs would need to be updated to specify paragraphs and tables in a document.
Since table cells can contain other tables or multiple paragraphs, we could recursively validate paragraphs and tables in a container (for now, a document or table cell).

html = open(html_path).read()
buf = html2docx(html, title="table")

doc = docx.Document(buf)

assert len(doc.tables) == 1
table = doc.tables[0]

assert len(table.rows) == 2
assert len(table.columns) == 2

contents = [
["1", "2"],
["3"],
]

for r, row in enumerate(contents):
for c, text in enumerate(row):
assert table.cell(r, c).text == text

assert table.cell(0, 0).paragraphs[0].runs[0].font.bold is None
assert table.cell(0, 1).paragraphs[0].runs[0].font.bold is True


def test_table_nested():
    """Check that a table nested inside a cell is converted correctly.

    Converts ``table_nested.html`` and verifies that the document holds one
    top-level 2x2 table, and that the cell at row 1, column 1 holds a second
    table with the expected cell texts.
    """
    html_path = os.path.join(TEST_DIR, "table_nested.html")
    # Use a context manager so the fixture file is closed deterministically.
    with open(html_path) as fp:
        html = fp.read()
    buf = html2docx(html, title="table")

    doc = docx.Document(buf)
    assert len(doc.tables) == 1

    table = doc.tables[0]

    assert len(table.rows) == 2
    assert len(table.columns) == 2

    # Cell (1, 1) of the outer table holds the nested table, so its text is
    # not compared here.
    contents = [
        ["1", "2"],
        ["3"],
    ]

    for r, row in enumerate(contents):
        for c, text in enumerate(row):
            assert table.cell(r, c).text == text

    cell = table.cell(1, 1)
    assert len(cell.tables) == 1

    table2 = cell.tables[0]

    contents2 = [
        ["4", "5"],
        ["6", "7"],
    ]

    for r, row in enumerate(contents2):
        for c, text in enumerate(row):
            assert table2.cell(r, c).text == text