From e4d4d99d4a6d863f1f226ae3b4502b9130f618ba Mon Sep 17 00:00:00 2001 From: oleibman <10341515+oleibman@users.noreply.github.com> Date: Sat, 11 Nov 2023 18:43:41 -0800 Subject: [PATCH] Improve ODText Content Reader Fix #2493. There is much that the ODT Reader ignores. This change adds support for the `text:section`, `text:span`, `text:s`, and `text:tab` tags, thereby handling multiple sections, text runs, tab characters, and multiple spaces. There will still be many omissions (e.g. styles and tables), but you will now often be able to access the text content of valid ODT documents. The issue suggests variations in a simple file created on its own by LibreOffice, and a similar file created by PhpWord. Both are unit-tested. A `getText` method is added to TextRun to facilitate testing (and can be useful on its own). It will return the concatenated texts of all elements of the text run. --- phpstan-baseline.neon | 5 -- src/PhpWord/Element/TextRun.php | 12 +++ src/PhpWord/Reader/ODText/Content.php | 80 +++++++++++++++-- .../Reader/ODText/ODTextSectionTest.php | 83 ++++++++++++++++++ .../_files/documents/word.2493.nosection.odt | Bin 0 -> 3066 bytes 5 files changed, 170 insertions(+), 10 deletions(-) create mode 100644 tests/PhpWordTests/Reader/ODText/ODTextSectionTest.php create mode 100644 tests/PhpWordTests/_files/documents/word.2493.nosection.odt diff --git a/phpstan-baseline.neon b/phpstan-baseline.neon index 2dd69c7278..ada2c57cc7 100644 --- a/phpstan-baseline.neon +++ b/phpstan-baseline.neon @@ -165,11 +165,6 @@ parameters: count: 1 path: src/PhpWord/Reader/HTML.php - - - message: "#^Call to an undefined method DOMNode\\:\\:getAttribute\\(\\)\\.$#" - count: 2 - path: src/PhpWord/Reader/ODText/Content.php - - message: "#^Offset 'textNodes' on array\\{changed\\: PhpOffice\\\\PhpWord\\\\Element\\\\TrackChange, textNodes\\: DOMNodeList\\\\} in isset\\(\\) always exists and is not nullable\\.$#" count: 1 diff --git a/src/PhpWord/Element/TextRun.php b/src/PhpWord/Element/TextRun.php index fc8727592a..33c55f6584 100644 --- a/src/PhpWord/Element/TextRun.php +++ b/src/PhpWord/Element/TextRun.php @@ -78,4 +78,16 @@ public function setParagraphStyle($style = null) return $this->paragraphStyle; } + + public function getText(): string + { + $outstr = ''; + foreach ($this->getElements() as $element) { + if ($element instanceof Text) { + $outstr .= $element->getText(); + } + } + + return $outstr; + } } diff --git a/src/PhpWord/Reader/ODText/Content.php b/src/PhpWord/Reader/ODText/Content.php index ccbc5eec96..06c8cefa8c 100644 --- a/src/PhpWord/Reader/ODText/Content.php +++ b/src/PhpWord/Reader/ODText/Content.php @@ -18,6 +18,9 @@ namespace PhpOffice\PhpWord\Reader\ODText; use DateTime; +use DOMElement; +use DOMNodeList; +use PhpOffice\PhpWord\Element\Section; use PhpOffice\PhpWord\Element\TrackChange; use PhpOffice\PhpWord\PhpWord; use PhpOffice\PhpWord\Shared\XMLReader; @@ -29,6 +32,9 @@ */ class Content extends AbstractPart { + /** @var ?Section */ + private $section; + /** * Read content.xml. */ @@ -40,18 +46,31 @@ public function read(PhpWord $phpWord): void $trackedChanges = []; $nodes = $xmlReader->getElements('office:body/office:text/*'); + $this->section = null; + $this->processNodes($nodes, $xmlReader, $phpWord); + $this->section = null; + } + + /** @param DOMNodeList $nodes */ + public function processNodes(DOMNodeList $nodes, XMLReader $xmlReader, PhpWord $phpWord): void + { if ($nodes->length > 0) { - $section = $phpWord->addSection(); foreach ($nodes as $node) { // $styleName = $xmlReader->getAttribute('text:style-name', $node); switch ($node->nodeName) { case 'text:h': // Heading $depth = $xmlReader->getAttribute('text:outline-level', $node); - $section->addTitle($node->nodeValue, $depth); + $this->getSection($phpWord)->addTitle($node->nodeValue, $depth); break; case 'text:p': // Paragraph + $styleName = $xmlReader->getAttribute('text:style-name', $node); + if (substr($styleName, 0, 2) === 'SB') { + break; + } $children = $node->childNodes; + $spans = false; + /** @var DOMElement $child */ foreach ($children as $child) { switch ($child->nodeName) { case 'text:change-start': @@ -71,16 +90,50 @@ public function read(PhpWord $phpWord): void $changed = $trackedChanges[$changeId]; } + break; + case 'text:span': + $spans = true; + break; } } - $element = $section->addText($node->nodeValue); + if ($spans) { + $element = $this->getSection($phpWord)->addTextRun(); + foreach ($children as $child) { + switch ($child->nodeName) { + case 'text:span': + /** @var DOMElement $child2 */ + foreach ($child->childNodes as $child2) { + switch ($child2->nodeName) { + case '#text': + $element->addText($child2->nodeValue); + + break; + case 'text:tab': + $element->addText("\t"); + + break; + case 'text:s': + $spaces = (int) $child2->getAttribute('text:c') ?: 1; + $element->addText(str_repeat(' ', $spaces)); + + break; + } + } + + break; + + } + } + } else { + $element = $this->getSection($phpWord)->addText($node->nodeValue); + } if (isset($changed) && is_array($changed)) { $element->setTrackChange($changed['changed']); if (isset($changed['textNodes'])) { foreach ($changed['textNodes'] as $changedNode) { - $element = $section->addText($changedNode->nodeValue); + $element = $this->getSection($phpWord)->addText($changedNode->nodeValue); $element->setTrackChange($changed['changed']); } } @@ -91,7 +144,7 @@ public function read(PhpWord $phpWord): void $listItems = $xmlReader->getElements('text:list-item/text:p', $node); foreach ($listItems as $listItem) { // $listStyleName = $xmlReader->getAttribute('text:style-name', $listItem); - $section->addListItem($listItem->nodeValue, 0); + $this->getSection($phpWord)->addListItem($listItem->nodeValue, 0); } break; @@ -110,9 +163,26 @@ public function read(PhpWord $phpWord): void $trackedChanges[$changedRegion->getAttribute('text:id')] = ['changed' => $changed, 'textNodes' => $textNodes]; } + break; + case 'text:section': // Section + // $sectionStyleName = $xmlReader->getAttribute('text:style-name', $listItem); + $this->section = $phpWord->addSection(); + $children = $node->childNodes; + $this->processNodes($children, $xmlReader, $phpWord); + break; } } } } + + private function getSection(PhpWord $phpWord): Section + { + $section = $this->section; + if ($section === null) { + $section = $this->section = $phpWord->addSection(); + } + + return $section; + } } diff --git a/tests/PhpWordTests/Reader/ODText/ODTextSectionTest.php b/tests/PhpWordTests/Reader/ODText/ODTextSectionTest.php new file mode 100644 index 0000000000..0a1a4512db --- /dev/null +++ b/tests/PhpWordTests/Reader/ODText/ODTextSectionTest.php @@ -0,0 +1,83 @@ +filename !== '') { + unlink($this->filename); + $this->filename = ''; + } + } + + public function testWriteThenReadSection(): void + { + $dir = 'tests/PhpWordTests/_files'; + Settings::setOutputEscapingEnabled(true); + $phpWord = new PhpWord(); + $section = $phpWord->addSection(); + $inputText = ['days', 'monday', 'tuesday']; + $inputText[] = "Tab\tthen two spaces then done."; + foreach ($inputText as $text) { + $section->addText($text); + } + $writer = IOFactory::createWriter($phpWord, 'ODText'); + $this->filename = "$dir/sectiontest.odt"; + $writer->save($this->filename); + + $reader = IOFactory::createReader('ODText'); + $phpWord2 = $reader->load($this->filename); + $outputText = []; + foreach ($phpWord2->getSections() as $section) { + foreach ($section->getElements() as $element) { + if (is_object($element) && method_exists($element, 'getText')) { + $outputText[] = $element->getText(); + } + } + } + self::assertSame($inputText, $outputText); + } + + public function testReadNoSections(): void + { + $dir = 'tests/PhpWordTests/_files/documents'; + $inputText = ['days', 'monday', 'tuesday']; + + $reader = IOFactory::createReader('ODText'); + $filename = "$dir/word.2493.nosection.odt"; + $phpWord2 = $reader->load($filename); + $outputText = []; + foreach ($phpWord2->getSections() as $section) { + foreach ($section->getElements() as $element) { + if (is_object($element) && method_exists($element, 'getText')) { + $outputText[] = $element->getText(); + } + } + } + self::assertSame($inputText, $outputText); + } +} diff --git a/tests/PhpWordTests/_files/documents/word.2493.nosection.odt b/tests/PhpWordTests/_files/documents/word.2493.nosection.odt new file mode 100644 index 0000000000000000000000000000000000000000..eb0fa2076433c91a29fd7a2b4db0762c80c7e09d GIT binary patch literal 3066 zcmZ`*2{e>#8-9@`OO&OBO8Bxb`Nvjh?1VxllqE4VGPW7Z*o~!#A;!M%24l-MmZXeq zW6ds2L?mR1QOQ62|98Is>-^vSp6A@}dCz@4=Y8J$z3%HW(x*Gd1sp~{05!Ajlw}pB z-XAU|>Vm`IP$bF?YU}3a470OE!dzXXy;8}zdZL>)v^XN`h zu@?kx0juovLPLTy>ov&lD>K)zY>kg^v;tYEfh`B+`LO06`Q+zxqYgBKO;vs20;RT91F9;kdI!~Ccdy|}CJ8SloH!4~zCtfIxLDIpt% ztoO=uc)~w#P2>t$JUKs_>x`DTdJTf;Qr|mq)%aYC&ecW*v9A=LWFhRAYK;o8*LC;H z4PU#J%3_P(yf|IU6thsBZ(>W@B!eDp$5moK7x=2H$Si+`lOmewRV6u%f7(a2!M<2*6PZ~^} zqDiZtqV|4iC}e1JW$72e@_@S>Y^Bfpy!PN}^*&px^rji#myRFSS2*K&s=U1KYg{fn z*j?VBU&nz}Qmhzt49-2Zf3GZ>-;#<`9a)rFx>B6BhXBts7z#ztH{UTq4L{epy|{4k zb^X?AzV6f-#S7cSnEfxpW9ulH7$)M3qK>D+x1Cg*apN(dZ)SDm`>}apN3@yZb&$r- z=!pq&`HZ*2ibl4HZoB6(r7?bqzYu(@do-XG`I{I}52$B1C*QM3WV~mTBX^8yt^Cb>pAN*$ZoVGA7l_`r7O7uT*)M-)0AHU6MW|u>!?~rywoj{VjkG#&532;s z`5F|-E1DfIUdMMnzy0xT#pk(Zh+H;GrN|TCv^T+ll=ZICwnR(4)D~GDyhQ$EVgj9Z z$Z!zX_qn>1 zLfNJZELgg7O|bffeDa?ss~sbGtDeD=NDZ+ETwa{`v{_}>!1>*;Prk1yZcoxgC*GAS zkzncpyGs^)KDS-lJm5NH^#eclqxB;=d~pQmS)fx)+UFi9636cSz!u_0bGk)JQxlWK zi^M z4kbd3=fc^l4-`!A4+z5WX8FOzgF_VpAFti~rtYo% zRq(n9A$PQ%Kdc~~*62<05;!C>A(lP1t?s1UuL9wlV#`7+TM5fhrPAunD_cRrgqKBo zWt%d%YH=K@m4%7rbqQXiIr;KIP>H_Og!T!1SH- zfA0?1)>{#L3;?#NGmn`%^AJdsGZb;uAG2x9v^+<^y4e}#@4CIoR!GTm1rcU0kwKD1 zfrq;>)l2>*#f-R79U4aP_gZUtfvMJ%Zq}05Kbc5kj1`b;W z5dP3&32jH!K=F-}i)nst&hQTpKApSvr7P*eqRtL^A`LdxC+B4@fpiWuMZ`#J$K?wU z#KfKUAUvnr7X|`M%fL)Td6531gfGscdqpskkd_Kn3cHB{I<0~W{ZmSC8kh8yp}hz3EU3Vl2d`* z<<*v z(r>z2kU^l#DRsKZb0X&)FL>ejE(#zg(R&Pn-8p&7P^y|ajQ%mDO!J*4%V;uEWpz`Q zpU-xXTa=q_w}+EeYj;m8_EBLS4SGBb7eT36ydh%QaA%w&!*{Pl%lx!7=x1`TEyO~a zlM;8%ZjCd}nk-&eJ^Q|Gr;_d|_VRT#{e#|D_WbCMyc(8Kdc< zUEn%*%8Ku5LFRgs&>DlW@cP+2r7YgCRKmdMwmf*x#_7jU&Uaz$Mw98_u`}}bSf3;%xxF>e}*>IS0+tgFS;!ke`lip z(XhC9BE89Q?-KrX^%B5PI#|L1s|$=&ClDw1Y3^gm5+Wi$jCoi3Wxcp{E_KO0iQ8V> z+YM=g`hw;|4s^6IWdLyAm7)1-_d@q-8Li=0$p(6wU@KduPb^VVqWj=Zc05rfEAhgkuZ