This repository has been archived by the owner on Sep 22, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
cmg_anton_nikolaev
committed
Sep 16, 2015
1 parent
1d697e0
commit 9f5f512
Showing
6 changed files
with
573 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1 @@ | ||
assets/* | ||
!assets/.gitignore | ||
protected/runtime/* | ||
!protected/runtime/.gitignore | ||
protected/data/*.db | ||
themes/classic/views/ | ||
/.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,112 @@ | ||
# pdf-to-html | ||
# PDF to HTML PHP Class | ||
|
||
This PHP class can convert your pdf files to html using poppler-utils. | ||
|
||
## Thanks | ||
|
||
Big thanks to Mochamad Gufron ([mgufrone](https://github.com/mgufrone))! This package is based on his package (https://github.com/mgufrone/pdf-to-html). | ||
|
||
## Important Notes | ||
|
||
Please see how to use below. | ||
|
||
## Installation | ||
|
||
From the root directory of your application, run this command to add the package to your app: | ||
|
||
``` | ||
composer require tonchik-tm/pdf-to-html:~1 | ||
``` | ||
|
||
Or add this package to your `composer.json` | ||
|
||
```json | ||
{ | ||
"require": { | ||
"tonchik-tm/pdf-to-html": "~1" | ||
} | ||
} | ||
``` | ||
|
||
## Requirements | ||
### 1. Install Poppler-Utils | ||
|
||
**Debian/Ubuntu** | ||
```bash | ||
sudo apt-get install poppler-utils | ||
``` | ||
|
||
**Mac OS X** | ||
```bash | ||
brew install poppler | ||
``` | ||
|
||
**Windows** | ||
|
||
This package can also be used on Windows. First, download the latest poppler-utils binary for Windows from <http://blog.alivate.com.au/poppler-windows/>. | ||
|
||
After downloading it, extract the archive. | ||
|
||
### 2. Find out where the utilities are located | ||
|
||
**Debian/Ubuntu** | ||
```bash | ||
$ whereis pdftohtml | ||
pdftohtml: /usr/bin/pdftohtml | ||
|
||
$ whereis pdfinfo | ||
pdfinfo: /usr/bin/pdfinfo | ||
``` | ||
|
||
**Mac OS X** | ||
```bash | ||
$ which pdfinfo | ||
/usr/local/bin/pdfinfo | ||
|
||
$ which pdftohtml | ||
/usr/local/bin/pdftohtml | ||
``` | ||
|
||
**Windows** | ||
|
||
Go to the extracted directory. It contains a directory called `bin`; this is the one we need. | ||
|
||
### 3. PHP Configuration with shell access enabled | ||
|
||
## Usage | ||
|
||
Unix example: | ||
|
||
```php | ||
<?php | ||
// if you are using composer, just use this | ||
include 'vendor/autoload.php'; | ||
|
||
// initiate | ||
$pdf = new \TonchikTm\PdfToHtml\Pdf('test.pdf', [ | ||
'pdftohtml_path' => '/usr/bin/pdftohtml', | ||
'pdfinfo_path' => '/usr/bin/pdfinfo' | ||
]); | ||
|
||
// example for windows | ||
// $pdf = new \TonchikTm\PdfToHtml\Pdf('test.pdf', [ | ||
// 'pdftohtml_path' => '/path/to/poppler/bin/pdftohtml.exe', | ||
// 'pdfinfo_path' => '/path/to/poppler/bin/pdfinfo.exe' | ||
// ]); | ||
|
||
// get pdf info | ||
$pdfInfo = $pdf->getInfo(); | ||
|
||
// get count pages | ||
$countPages = $pdf->countPages(); | ||
|
||
// get content from one page | ||
$contentFirstPage = $pdf->getHtml()->getPage(1); | ||
|
||
// get content from all pages and loop over them | ||
foreach ($pdf->getHtml()->getAllPages() as $page) { | ||
echo $page . '<br/>'; | ||
} | ||
``` | ||
|
||
## Feedback & Contribute | ||
|
||
Open an issue for any improvement or bug you find. I love to help and to solve other people's problems. Thanks :+1: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
{ | ||
"name": "tonchik-tm/pdf-to-html", | ||
"description": "This PHP class can convert your pdf files to html using poppler-utils.", | ||
"license": "GNU GENERAL PUBLIC LICENSE", | ||
"authors": [ | ||
{ | ||
"name": "Anton Nikolaev", | ||
"email": "[email protected]", | ||
"homepage": "http://www.tonchik.net", | ||
"role": "Developer" | ||
} | ||
], | ||
"minimum-stability": "stable", | ||
"require": { | ||
"pelago/emogrifier": "@dev", | ||
"symfony/css-selector": "^2.7" | ||
}, | ||
|
||
"autoload": { | ||
"psr-0": { | ||
"TonchikTm": "src" | ||
}, | ||
"psr-4": { | ||
"TonchikTm\\PdfToHtml\\": "src/" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
<?php | ||
/** | ||
* Created by PhpStorm. | ||
* User: tonchik™ | ||
* Date: 16.09.2015 | ||
* Time: 13:00 | ||
*/ | ||
|
||
namespace TonchikTm\PdfToHtml; | ||
|
||
/**
 * Base class with the properties and methods shared by the converter classes:
 * a key/value option bag and the output directory used for generated files.
 */
class Base
{
    /** @var string Directory that holds the generated files. */
    private $outputDir = '';

    /** @var array Option values keyed by option name. */
    private $options = [];

    /**
     * Get all options or one option by key.
     *
     * Any falsy key (null, '', '0', 0, false) returns the whole option array.
     *
     * @param string|null $key Option name, or null for all options.
     * @return mixed The option array when no key is given; otherwise the
     *               option value, or null when the option is not set.
     */
    public function getOptions($key = null)
    {
        if (!$key) {
            return $this->options;
        }

        return isset($this->options[$key]) ? $this->options[$key] : null;
    }

    /**
     * Set options as array or pair key-value.
     *
     * An array is merged over the current options (later keys win); a string
     * key sets a single option. Any other key type is ignored.
     *
     * @param array|string $key   Option array, or the name of one option.
     * @param mixed        $value Value for the single option; unused when
     *                            $key is an array.
     */
    public function setOptions($key, $value = null)
    {
        if (is_array($key)) {
            $this->options = array_merge($this->options, $key);
        } elseif (is_string($key)) {
            $this->options[$key] = $value;
        }
    }

    /**
     * Get output dir.
     *
     * @return string
     */
    public function getOutputDir()
    {
        return $this->outputDir;
    }

    /**
     * Set output dir.
     *
     * The value is stored both in the dedicated property and in the
     * 'outputDir' option so that the two stay in sync.
     *
     * @param string $dir
     * @return $this
     */
    public function setOutputDir($dir)
    {
        $this->setOptions('outputDir', $dir);
        $this->outputDir = $dir;
        return $this;
    }

    /**
     * Clear all files that have been generated by pdftohtml.
     *
     * Make sure the directory ONLY contains generated files, because
     * everything below the output directory is removed, including nested
     * directories. Nothing happens when no output directory is set or the
     * directory does not exist.
     *
     * @param bool $removeSelf Also remove the output directory itself.
     * @return $this
     */
    public function clearOutputDir($removeSelf = false)
    {
        $dir = $this->getOutputDir();

        // An empty path must never be handed to the iterator or to rmdir(),
        // and a missing directory has nothing to clear.
        if (!is_string($dir) || $dir === '' || !is_dir($dir)) {
            return $this;
        }

        // CHILD_FIRST yields directories after their contents, so every
        // directory is already empty when rmdir() is called on it. The default
        // LEAVES_ONLY mode never yields directories at all.
        $entries = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dir, \FilesystemIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::CHILD_FIRST
        );

        foreach ($entries as $entry) { /** @var \SplFileInfo $entry */
            $path = $entry->getPathname();
            if ($entry->isDir() && !$entry->isLink()) {
                rmdir($path);
            } else {
                // Regular files and symlinks (which are not followed).
                unlink($path);
            }
        }

        if ($removeSelf) {
            rmdir($dir);
        }

        return $this;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
<?php | ||
/** | ||
* Created by PhpStorm. | ||
* User: tonchik™ | ||
* Date: 15.09.2015 | ||
* Time: 19:18 | ||
*/ | ||
|
||
namespace TonchikTm\PdfToHtml; | ||
|
||
use DOMDocument; | ||
use DOMXPath; | ||
use Pelago\Emogrifier; | ||
|
||
/**
 * Collection of HTML pages with optional post-processing applied as each page
 * is added: CSS inlining, image inlining and body-only extraction.
 */
class Html extends Base
{
    /** @var int Number of pages currently stored in the collection. */
    private $pages = 0;

    /** @var string[] Page HTML keyed by page number. */
    private $content = [];

    /** @var array Option values used unless the caller overrides them. */
    private $defaultOptions = [
        'inlineCss' => true,
        'inlineImages' => true,
        'onlyContent' => false,
        'outputDir' => ''
    ];

    /**
     * @param array $options Overrides for the default options
     *                       (inlineCss, inlineImages, onlyContent, outputDir).
     */
    public function __construct($options = [])
    {
        $this->setOptions(array_merge($this->defaultOptions, $options));

        // Image inlining reads getOutputDir(), not the option bag, so an
        // 'outputDir' passed as an option has to be applied explicitly.
        $outputDir = $this->getOptions('outputDir');
        if (is_string($outputDir) && $outputDir !== '') {
            $this->setOutputDir($outputDir);
        }
    }

    /**
     * Add page to collection with the conversion, according to options.
     *
     * Processing order: inline CSS, inline images, then body extraction.
     * Adding a page under an existing number replaces that page.
     *
     * @param integer $number  Page number used as the collection key.
     * @param string  $content Raw HTML of the page.
     * @return $this
     */
    public function addPage($number, $content)
    {
        if ($this->getOptions('inlineCss')) {
            $content = $this->setInlineCss($content);
        }

        if ($this->getOptions('inlineImages')) {
            $content = $this->setInlineImages($content);
        }

        if ($this->getOptions('onlyContent')) {
            $content = $this->setOnlyContent($content);
        }

        $this->content[$number] = $content;
        // Count the collection, not the page string just stored.
        $this->pages = count($this->content);
        return $this;
    }

    /**
     * Get the HTML of one page.
     *
     * @param integer $number Page number the page was added under.
     * @return string|null The page HTML, or null when no such page exists.
     */
    public function getPage($number)
    {
        return isset($this->content[$number]) ? $this->content[$number] : null;
    }

    /**
     * Get the HTML of all pages.
     *
     * @return string[] Page HTML keyed by page number.
     */
    public function getAllPages()
    {
        return $this->content;
    }

    /**
     * The method replaces css classes with inline css rules.
     *
     * @param string $content
     * @return string
     */
    private function setInlineCss($content)
    {
        // Strip HTML comment markers before the markup is handed to Emogrifier
        // (presumably because the style rules are wrapped in comments - confirm).
        $content = str_replace(['<!--', '-->'], '', $content);
        $parser = new Emogrifier($content);
        return $parser->emogrify();
    }

    /**
     * The method looks for images in html and replaces the src attribute
     * with a base64 data URI built from the file in the output directory.
     *
     * Images that are already inlined, have no src, or whose file cannot be
     * read are left untouched.
     *
     * @param string $content
     * @return string
     */
    private function setInlineImages($content)
    {
        if (trim($content) === '') {
            return $content;
        }

        $xpath = new DOMXPath($this->loadDom($content));

        foreach ($xpath->query('//img') as $img) { /** @var \DOMElement $img */
            $attrImage = $img->getAttribute('src');
            if ($attrImage === '' || strpos($attrImage, 'data:') === 0) {
                continue;
            }

            // Only the file name is used; the image is looked up in the output dir.
            $image = $this->getOutputDir() . '/' . pathinfo($attrImage, PATHINFO_BASENAME);
            if (!is_file($image) || !is_readable($image)) {
                continue;
            }

            $imageData = file_get_contents($image);
            if ($imageData === false) {
                continue;
            }

            $mime = mime_content_type($image);
            if (!is_string($mime) || $mime === '') {
                $mime = 'application/octet-stream';
            }

            // No whitespace after "data:" - the data URI syntax does not allow it.
            $src = 'data:' . $mime . ';base64,' . base64_encode($imageData);
            $content = str_replace($attrImage, $src, $content);
        }

        return $content;
    }

    /**
     * The method takes from html body content only.
     *
     * @param string $content
     * @return string The trimmed inner HTML of the first body element, or an
     *                empty string when there is none.
     */
    private function setOnlyContent($content)
    {
        if (trim($content) === '') {
            return '';
        }

        $dom = $this->loadDom($content);
        $bodies = $dom->getElementsByTagName('body');
        if ($bodies->length === 0) {
            return '';
        }

        $html = '';
        foreach ($bodies->item(0)->childNodes as $node) {
            $html .= $dom->saveHTML($node);
        }

        return trim($html);
    }

    /**
     * Parse an HTML string into a DOM document without emitting libxml
     * warnings for markup the parser does not like.
     *
     * @param string $content Non-empty HTML.
     * @return DOMDocument
     */
    private function loadDom($content)
    {
        $previous = libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $dom->loadHTML($content);

        // Drop the collected parse errors and restore the caller's setting.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $dom;
    }
}
Oops, something went wrong.