Skip to content

Commit

Permalink
MsDoc Reader: Support for UTF-8 characters (#2664)
Browse files Browse the repository at this point in the history
  • Loading branch information
Progi1984 authored Aug 30, 2024
1 parent 9f755a4 commit b0ed3db
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs/changes/1.x/1.3.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- Word2007 Reader : Support for FormFields by [@vincentKool](https://github.com/vincentKool) in [#2653](https://github.com/PHPOffice/PHPWord/pull/2653)
- RTF Writer : Support for Table Border Style fixing [#345](https://github.com/PHPOffice/PHPWord/issues/345) by [@Progi1984](https://github.com/Progi1984) in [#2656](https://github.com/PHPOffice/PHPWord/pull/2656)
- Word2007 Reader: Support the page break (<w:lastRenderedPageBreak/>) by [@stanolacko](https://github.com/stanolacko) in [#2662](https://github.com/PHPOffice/PHPWord/pull/2662)
- MsDoc Reader: Support for UTF-8 characters by [@Progi1984](https://github.com/Progi1984) fixing [#881](https://github.com/PHPOffice/PHPWord/issues/881), [#1454](https://github.com/PHPOffice/PHPWord/issues/1454), [#1817](https://github.com/PHPOffice/PHPWord/issues/1817), [#1927](https://github.com/PHPOffice/PHPWord/issues/1927), [#2383](https://github.com/PHPOffice/PHPWord/issues/2383), [#2565](https://github.com/PHPOffice/PHPWord/issues/2565) in [#2664](https://github.com/PHPOffice/PHPWord/pull/2664)

### Bug fixes

Expand Down
10 changes: 6 additions & 4 deletions src/PhpWord/Reader/MsDoc.php
Original file line number Diff line number Diff line change
Expand Up @@ -1279,10 +1279,12 @@ private function readRecordPlcfBtePapx(): void
break;
}
$strLen = $arrayRGFC[$key + 1] - $arrayRGFC[$key] - 1;
for ($inc = 0; $inc < $strLen; ++$inc) {
$byte = self::getInt1d($this->dataWorkDocument, $arrayRGFC[$key] + $inc);
for ($inc = 0; $inc < ($strLen * 2); ++$inc) {
$byte = self::getInt2d($this->dataWorkDocument, $arrayRGFC[$key] + ($inc * 2));
if ($byte > 0) {
$string .= chr($byte);
$string .= mb_chr($byte, 'UTF-8');
} else {
break;
}
}
}
Expand Down Expand Up @@ -2331,7 +2333,7 @@ private function generatePhpWord(): void
foreach ($this->arrayParagraphs as $itmParagraph) {
$textPara = $itmParagraph;
foreach ($this->arrayCharacters as $oCharacters) {
$subText = substr($textPara, $oCharacters->pos_start, $oCharacters->pos_len);
$subText = mb_substr($textPara, $oCharacters->pos_start, $oCharacters->pos_len);
$subText = str_replace(chr(13), PHP_EOL, $subText);
$arrayText = explode(PHP_EOL, $subText);
if (end($arrayText) == '') {
Expand Down
66 changes: 61 additions & 5 deletions tests/PhpWordTests/Reader/MsDocTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
namespace PhpOffice\PhpWordTests\Reader;

use Exception;
use PhpOffice\PhpWord\Element\Text;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Reader\MsDoc;

/**
Expand Down Expand Up @@ -50,14 +52,20 @@ public function testCanReadFailed(): void
self::assertFalse($object->canRead($filename));
}

/**
* Load.
*/
public function testLoad(): void
/**
 * Loads the reference `reader.doc` fixture through the MsDoc reader and
 * checks that its single section starts with the expected Text element.
 */
public function testLoadBasic(): void
{
    $filename = __DIR__ . '/../_files/documents/reader.doc';
    $phpWord = IOFactory::load($filename, 'MsDoc');
    self::assertInstanceOf(PhpWord::class, $phpWord);

    $sections = $phpWord->getSections();
    self::assertCount(1, $sections);

    $elements = $sections[0]->getElements();
    self::assertArrayHasKey(0, $elements);

    /** @var Text $element0 */
    $element0 = $elements[0];
    self::assertInstanceOf(Text::class, $element0);
    // Strict comparison: the decoded text must be exactly this string.
    self::assertSame('Welcome to PhpWord', $element0->getText());
}

public function testLoadHalfPointFont(): void
Expand All @@ -76,6 +84,54 @@ public function testLoadHalfPointFont(): void
}
}

/**
 * Loads the `docChinese.doc` fixture through the MsDoc reader and checks
 * that the first element of its single section is a Text element carrying
 * the expected mixed Latin/CJK string.
 */
public function testLoadChinese(): void
{
    $filename = __DIR__ . '/../_files/documents/docChinese.doc';
    $phpWord = IOFactory::load($filename, 'MsDoc');
    self::assertInstanceOf(PhpWord::class, $phpWord);

    $sections = $phpWord->getSections();
    self::assertCount(1, $sections);

    $elements = $sections[0]->getElements();
    self::assertArrayHasKey(0, $elements);

    /** @var Text $element0 */
    $element0 = $elements[0];
    self::assertInstanceOf(Text::class, $element0);
    // Strict comparison: the decoded text must be exactly this string.
    self::assertSame('OKKI AI 客户案例', $element0->getText());
}

/**
 * Loads the `docCzech.doc` fixture through the MsDoc reader and checks
 * that the first element of its single section is a Text element carrying
 * the expected Czech sentence, diacritics included.
 */
public function testLoadCzech(): void
{
    $filename = __DIR__ . '/../_files/documents/docCzech.doc';
    $phpWord = IOFactory::load($filename, 'MsDoc');
    self::assertInstanceOf(PhpWord::class, $phpWord);

    $sections = $phpWord->getSections();
    self::assertCount(1, $sections);

    $elements = $sections[0]->getElements();
    self::assertArrayHasKey(0, $elements);

    /** @var Text $element0 */
    $element0 = $elements[0];
    self::assertInstanceOf(Text::class, $element0);
    // Strict comparison: the decoded text must be exactly this string.
    self::assertSame('Příliš žluťoučký kůň pěl ďábelské ódy', $element0->getText());
}

/**
 * Loads the `docSlovak.doc` fixture through the MsDoc reader and checks
 * that the first element of its single section is a Text element carrying
 * the expected word.
 */
public function testLoadSlovak(): void
{
    $filename = __DIR__ . '/../_files/documents/docSlovak.doc';
    $phpWord = IOFactory::load($filename, 'MsDoc');
    self::assertInstanceOf(PhpWord::class, $phpWord);

    $sections = $phpWord->getSections();
    self::assertCount(1, $sections);

    $elements = $sections[0]->getElements();
    self::assertArrayHasKey(0, $elements);

    /** @var Text $element0 */
    $element0 = $elements[0];
    self::assertInstanceOf(Text::class, $element0);
    // Strict comparison: the decoded text must be exactly this string.
    self::assertSame('Pondelok', $element0->getText());
}

/**
* Test exception on not existing file.
*/
Expand Down
Binary file not shown.
Binary file added tests/PhpWordTests/_files/documents/docCzech.doc
Binary file not shown.
Binary file not shown.

0 comments on commit b0ed3db

Please sign in to comment.