Skip to content

Commit c509b98

Browse files
authored
Merge pull request #186 from Codeuctivity/main
- DocumentAssembler - only add paragraph when no other block-level elements exist in the table cell - fix bug where conditionals can leave a table cell in an invalid state, causing Word to report an error - emulate Word's whitespace handling and treatment of xml:space="preserve" avoid spurious error and crash on - UnicodeMapper - more comprehensive fix for whitespace handling in UnicodeMapper
2 parents b30d59c + 02a7167 commit c509b98

15 files changed

Lines changed: 387 additions & 99 deletions

OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ public class DaTests
107107
[InlineData("DA264-InvalidRunLevelRepeat.docx", "DA-Data.xml", true)]
108108
[InlineData("DA265-RunLevelRepeatWithWhiteSpaceBefore.docx", "DA-Data.xml", false)]
109109
[InlineData("DA266-RunLevelRepeat-NoData.docx", "DA-Data.xml", true)]
110+
[InlineData("DA268-Block-Conditional-In-Table-Cell.docx", "DA268-data.xml", false)]
110111
public void DA101(string name, string data, bool err)
111112
{
112113
var sourceDir = new DirectoryInfo("../../../../TestFiles/");
@@ -156,6 +157,24 @@ public void DA259(string name, string data, bool err)
156157
Assert.Equal(4, brCount);
157158
}
158159

160+
[Fact]
public void DA240()
{
    // Run the whitespace template through the shared DA101 test body; no template error is expected.
    var templateName = "DA240-Whitespace.docx";
    DA101(templateName, "DA240-Whitespace.xml", false);

    // Open the assembled output (presumably written to the temp dir by DA101 -- its body is not shown here).
    var assembledName = templateName.Replace(".docx", "-processed-by-DocumentAssembler.docx");
    var assembledFile = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, assembledName));
    var assembledDoc = new WmlDocument(assembledFile.FullName);
    var firstParagraph = assembledDoc.MainDocumentPart.Element(W.body).Elements(W.p).First();

    // Inserted content that begins or ends with white space must keep that white space.
    // Caution: XElement.Value is the plain concatenation of all descendant text (the W.t elements);
    // it is fast but does not apply xml:space="preserve" semantics, which Word honors when rendering.
    var concatenatedText = firstParagraph.Value;
    Assert.Equal("Content may or may not have spaces: he/she; he, she; he and she.", concatenatedText);

    // Therefore also verify the text produced by UnicodeMapper.RunToString (via InnerText),
    // which is expected to take xml:space="preserve" into account.
    var mappedText = InnerText(firstParagraph);
    Assert.Equal("Content may or may not have spaces: he/she; he, she; he and she.", mappedText);
}
177+
159178
[Theory]
160179
[InlineData("DA024-TrackedRevisions.docx", "DA-Data.xml")]
161180
public void DA102_Throws(string name, string data)
@@ -487,6 +506,15 @@ private static string GetDocumentText(WmlDocument document)
487506
private const string WidePngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAIAAABJdyC1AAACuUlEQVR4nO3UMQ7CQBAEwT3EvxEvXz/BZKalqniCifrs7gAUvGfmnO/TNwBu7H5edxuAfyFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFFzMzu2y8A5u4PQZkIj89BEMEAAAAASUVORK5CYII=";
488507
private const string TallPngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAMgAAAGQCAIAAABkkLjnAAAEF0lEQVR4nO3S0QkCURAEwX1i3mLke0lcI3hVAQzz0Wd3B+72npnzPbfv8mT72devP/CfhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkzu42y8yTXVnDDBu1Y983AAAAAElFTkSuQmCC";
489508
private const string TruncatedGifBase64 = "R0lGODlhyABQAA==";
509+
510+
// Concatenates UnicodeMapper.RunToString for every descendant run of the container,
// skipping runs whose direct parent is a w:del element.
private static string InnerText(XContainer e)
{
    var runTexts =
        from run in e.Descendants(W.r)
        where run.Parent.Name != W.del
        select UnicodeMapper.RunToString(run);

    return runTexts.StringConcatenate();
}
517+
490518
private static readonly List<string> s_ExpectedErrors = new List<string>()
491519
{
492520
"The 'http://schemas.openxmlformats.org/wordprocessingml/2006/main:evenHBand' attribute is not declared.",

OpenXmlPowerTools.Tests/DocumentBuilderTests.cs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,56 @@ public void DB012_NumberingsWithSameAbstractNumbering()
472472
Assert.Equal(3, numberingRoot.Elements(W.num).Count());
473473
}
474474

475+
[Fact]
public void DB012a_NumberingWithZeroIdIsValid()
{
    // The input document has a numbering definition with a zero id (explicitly indicating "no numbering").
    var testFilesDir = new DirectoryInfo("../../../../TestFiles/");
    var inputFile = new FileInfo(Path.Combine(testFilesDir.FullName, "DB012a-No-Numbering0.docx"));

    // The output goes to the temp dir, named after the input file.
    var outputName = inputFile.Name.Replace(".docx", "-processed-by-DocumentBuilder.docx");
    var outputFile = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, outputName));

    // Build a document from the single source and validate the result.
    var sources = new List<Source>
    {
        new Source(new WmlDocument(inputFile.FullName)),
    };
    DocumentBuilder.BuildDocument(sources, outputFile.FullName);
    Validate(outputFile);
}
491+
492+
[Fact]
public void DB012b_NumberingWithZeroIdWorks()
{
    // Builds a document from two sources: a host document whose first body paragraph is
    // replaced by a PtOpenXml.Insert placeholder with Id "Front", and a second document
    // passed to Source with that same "Front" id. The built result must validate.
    var sourceDir = new DirectoryInfo("../../../../TestFiles/");
    var hostFile = new FileInfo(Path.Combine(sourceDir.FullName, "DB012a-No-Numbering0.docx"));
    var insertFile = new FileInfo(Path.Combine(sourceDir.FullName, "DB012a-No-Numbering1.docx"));

    var hostDoc = new WmlDocument(hostFile.FullName);
    using (var mem = new MemoryStream())
    {
        mem.Write(hostDoc.DocumentByteArray, 0, hostDoc.DocumentByteArray.Length);
        using (var doc = WordprocessingDocument.Open(mem, true))
        {
            var xDoc = doc.MainDocumentPart.GetXDocument();

            // First() instead of FirstOrDefault(): if the fixture has no body paragraph, fail with a
            // clear "sequence contains no elements" error rather than a NullReferenceException below.
            var frontMatterPara = xDoc.Root.Elements(W.body).Elements(W.p).First();
            frontMatterPara.ReplaceWith(
                new XElement(PtOpenXml.Insert,
                    new XAttribute("Id", "Front")));
            doc.MainDocumentPart.PutXDocument();
        }

        // Copy the bytes back only after the package has been disposed above.
        hostDoc.DocumentByteArray = mem.ToArray();
    }

    var sources = new List<Source>()
    {
        new Source(hostDoc, true),
        new Source(new WmlDocument(insertFile.FullName), "Front"),
    };
    var processedDestDocx =
        new FileInfo(Path.Combine(TestUtil.TempDir.FullName, "DB012b-NumberingWithZeroIdWorks.docx"));
    DocumentBuilder.BuildDocument(sources, processedDestDocx.FullName);
    Validate(processedDestDocx);
}
524+
475525
[Fact]
476526
public void DB013a_LocalizedStyleIds_Heading()
477527
{

OpenXmlPowerTools.Tests/UnicodeMapperTests.cs

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,5 +153,125 @@ public void IgnoresTemporaryLayoutMarkers()
153153
// characters) should exactly match the output of UnicodeMapper:
154154
Assert.Equal(p.Value, actual);
155155
}
156+
157+
private const string PreserveSpacingXmlString =
158+
@"<w:document xmlns:w=""http://schemas.openxmlformats.org/wordprocessingml/2006/main"">
159+
<w:body>
160+
<w:p>
161+
<w:r>
162+
<w:t xml:space=""preserve"">The following space is retained: </w:t>
163+
</w:r>
164+
<w:r>
165+
<w:t>but this one is not: </w:t>
166+
</w:r>
167+
<w:r>
168+
<w:t xml:space=""preserve"">. Similarly these two lines should have only a space between them: </w:t>
169+
</w:r>
170+
<w:r>
171+
<w:t>
172+
Line 1!
173+
Line 2!
174+
</w:t>
175+
</w:r>
176+
</w:p>
177+
</w:body>
178+
</w:document>";
179+
180+
[Fact]
public void HonorsXmlSpace()
{
    // This somewhat rudimentary test is superseded by TreatsXmlSpaceLikeWord(),
    // but it is kept as a simple, direct illustration of a couple of the specific
    // cases covered by that more extensive suite.
    var parsed = XDocument.Parse(PreserveSpacingXmlString);
    var lastParagraph = parsed.Descendants(W.p).Last();

    // Map every run of the paragraph to text and join the pieces in document order.
    var runTexts = lastParagraph.Descendants(W.r).Select(run => UnicodeMapper.RunToString(run));
    var innerText = runTexts.StringConcatenate();

    Assert.Equal(@"The following space is retained: but this one is not:. Similarly these two lines should have only a space between them: Line 1! Line 2!", innerText);
}
193+
194+
// Verifies that UnicodeMapper.RunToString interprets whitespace in <w:t> elements
195+
// exactly the way Microsoft Word does, including honoring xml:space="preserve".
196+
// This is essential because RunToString is used by higher‑level features
197+
// (OpenXmlRegex, DocumentAssembler, etc.) that rely on its output to reflect the
198+
// text an end‑user would actually see and edit in Word.
199+
//
200+
// Word accepts a wide range of “valid” DOCX input, but it normalizes that input
201+
// into a canonical form when displaying or saving the document. These tests
202+
// compare RunToString’s output against Word’s canonicalized output to ensure
203+
// that whitespace is treated as semantic content in the same way Word treats it.
204+
[Fact]
public void TreatsXmlSpaceLikeWord()
{
    var sourceDir = new System.IO.DirectoryInfo("../../../../TestFiles/");

    // Test document: crafted to include many whitespace patterns that Word accepts as valid input
    var testDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-test.docx"));
    var testWmlDoc = new WmlDocument(testDoc.FullName);
    var testParagraphs = testWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Canonical document: the same test document after being opened and saved by Word,
    // representing Word's own normalized interpretation of that whitespace
    var expectedDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-Word-saved.docx"));
    var expectedWmlDoc = new WmlDocument(expectedDoc.FullName);
    var expectedParagraphs = expectedWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // The test document is read as consecutive pairs of paragraphs: (test name, test content).
    // The expected text comes from the paragraph at the content paragraph's index in the Word-saved document.
    for (int i = 0; i < testParagraphs.Count - 1; i += 2)
    {
        var testNameParagraph = testParagraphs[i];
        var testContentParagraph = testParagraphs[i + 1];

        // Get the test name from the first paragraph of the pair
        var testName = testNameParagraph.Descendants(W.t)
            .Select(t => (string)t)
            .StringConcatenate();

        // Fail with context (instead of an ArgumentOutOfRangeException) when the two fixtures
        // do not line up paragraph-for-paragraph.
        Assert.True(
            i + 1 < expectedParagraphs.Count,
            $"Test '{testName}' has no corresponding paragraph (index {i + 1}) in the expected document, which has {expectedParagraphs.Count} paragraphs."
        );

        // Get the actual result by calling UnicodeMapper.RunToString on the test content runs
        var actualResult = testContentParagraph.Descendants(W.r)
            .Select(UnicodeMapper.RunToString)
            .StringConcatenate();

        // Expected result: same paragraph index in the Word-saved document
        var expectedResult = ExtractExpectedFromWord(expectedParagraphs[i + 1]);

        // Assert.True with a custom message so that the failing test case is named in the output
        Assert.True(
            expectedResult == actualResult,
            $"Test '{testName}' failed. Expected: [{expectedResult}] Actual: [{actualResult}]"
        );
    }
}
242+
243+
// Extracts the expected text from Word’s canonicalized output for the whitespace tests.
244+
// This helper intentionally handles *only* the constructs that Word emits in the saved
245+
// version of UM-Whitespace-test.docx:
246+
// • <w:t> → literal text
247+
// • <w:tab/> → '\t'
248+
// • <w:lastRenderedPageBreak/> (intentionally ignored)
249+
// If any other run-level element appears, it means Word has emitted something this test
250+
// was not designed to handle, and the test fails loudly. This prevents the helper
251+
// from drifting toward reimplementing UnicodeMapper.RunToString.
252+
// Builds the expected text of a paragraph from the direct children of its direct runs:
// w:t contributes its text, w:tab contributes '\t', w:lastRenderedPageBreak is skipped,
// and any other child element raises InvalidOperationException.
private static string ExtractExpectedFromWord(XElement p)
{
    var text = new System.Text.StringBuilder();

    // Walk all children of all direct runs, in document order.
    foreach (var child in p.Elements(W.r).SelectMany(run => run.Elements()))
    {
        if (child.Name == W.lastRenderedPageBreak)
        {
            // Contributes no text.
            continue;
        }

        if (child.Name == W.t)
        {
            text.Append((string)child);
            continue;
        }

        if (child.Name == W.tab)
        {
            text.Append('\t');
            continue;
        }

        // Anything else is outside what this helper is meant to handle: fail loudly.
        throw new System.InvalidOperationException(
            $"Unexpected element <{child.Name.LocalName}> encountered in expected Word output.");
    }

    return text.ToString();
}
156276
}
157277
}

OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -654,7 +654,7 @@ private class RunReplacementInfo
654654
p.Add(new XElement(W.r,
655655
para.Elements(W.r).Elements(W.rPr).FirstOrDefault(),
656656
(p.Elements().Count() > 1) ? new XElement(W.br) : null,
657-
new XElement(W.t, line)));
657+
new XElement(W.t, GetXmlSpaceAttribute(line), line)));
658658
}
659659
return p;
660660
}
@@ -666,7 +666,7 @@ private class RunReplacementInfo
666666
list.Add(new XElement(W.r,
667667
run.Elements().Where(e => e.Name != W.t),
668668
(list.Count > 0) ? new XElement(W.br) : null,
669-
new XElement(W.t, line)));
669+
new XElement(W.t, GetXmlSpaceAttribute(line), line)));
670670
}
671671
return list;
672672
}
@@ -873,9 +873,24 @@ private class RunReplacementInfo
873873
}
874874
return null;
875875
}
876+
var transformedNodes = element.Nodes().Select(n => ContentReplacementTransform(n, data, templateError, owningPart));
877+
if (element.Name == W.tc)
878+
{
879+
// Check if the table cell contains any block-level elements
880+
// Valid block-level elements in a table cell: p (paragraph), tbl (table), sdt (structured document tag), customXml
881+
var nodesList = transformedNodes.ToList();
882+
var hasBlockLevelContent = nodesList.Any(n => n is XElement xe &&
883+
(xe.Name == W.p || xe.Name == W.tbl || xe.Name == W.sdt || xe.Name == W.customXml));
884+
if (!hasBlockLevelContent)
885+
{
886+
// Table cells must contain at least one block-level element -- add an empty paragraph
887+
nodesList.Add(new XElement(W.p));
888+
}
889+
transformedNodes = nodesList;
890+
}
876891
return new XElement(element.Name,
877892
element.Attributes(),
878-
element.Nodes().Select(n => ContentReplacementTransform(n, data, templateError, owningPart)));
893+
transformedNodes);
879894
}
880895
return node;
881896
}
@@ -1400,5 +1415,18 @@ private static string EvaluateXPathToString(XElement element, string xPath, bool
14001415

14011416
return xPathSelectResult.ToString();
14021417
}
1418+
1419+
// Returns an xml:space="preserve" attribute when the given text starts or ends with a
// white-space character (per char.IsWhiteSpace); returns null for null/empty text or
// text with no leading/trailing white space. A null result adds nothing when passed
// as XElement content.
private static XAttribute GetXmlSpaceAttribute(string textOfTextElement)
{
    if (string.IsNullOrEmpty(textOfTextElement))
    {
        return null;
    }

    var lastIndex = textOfTextElement.Length - 1;
    var hasEdgeWhiteSpace =
        char.IsWhiteSpace(textOfTextElement, 0) ||
        char.IsWhiteSpace(textOfTextElement, lastIndex);

    return hasEdgeWhiteSpace
        ? new XAttribute(XNamespace.Xml + "space", "preserve")
        : null;
}
14031431
}
14041432
}

0 commit comments

Comments
 (0)