Skip to content
This repository has been archived by the owner on Sep 22, 2022. It is now read-only.

Commit

Permalink
init version 1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
cmg_anton_nikolaev committed Sep 16, 2015
1 parent 1d697e0 commit 9f5f512
Show file tree
Hide file tree
Showing 6 changed files with 573 additions and 7 deletions.
7 changes: 1 addition & 6 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1 @@
assets/*
!assets/.gitignore
protected/runtime/*
!protected/runtime/.gitignore
protected/data/*.db
themes/classic/views/
/.idea
112 changes: 111 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,112 @@
# pdf-to-html
# PDF to HTML PHP Class

This PHP class can convert your pdf files to html using poppler-utils.

## Thanks

Big thanks to Mochamad Gufron ([mgufrone](https://github.com/mgufrone))! This package is based on his package (https://github.com/mgufrone/pdf-to-html).

## Important Notes

Please see how to use below.

## Installation

From your application's root directory, run this command to add the package to your app:

```
composer require tonchik-tm/pdf-to-html:~1
```

Or add this package to your `composer.json`

```json
{
    "require": {
        "tonchik-tm/pdf-to-html": "~1"
    }
}
```

## Requirements
### 1. Install Poppler-Utils

**Debian/Ubuntu**
```bash
sudo apt-get install poppler-utils
```

**Mac OS X**
```bash
brew install poppler
```

**Windows**

This package also works on Windows. First, download the latest poppler-utils binary for Windows from <http://blog.alivate.com.au/poppler-windows/>.

After downloading it, extract the archive.

### 2. Find out where the utilities are installed

**Debian/Ubuntu**
```bash
$ whereis pdftohtml
pdftohtml: /usr/bin/pdftohtml

$ whereis pdfinfo
pdfinfo: /usr/bin/pdfinfo
```

**Mac OS X**
```bash
$ which pdfinfo
/usr/local/bin/pdfinfo

$ which pdftohtml
/usr/local/bin/pdftohtml
```

**Windows**

Go to the extracted directory. It contains a directory called `bin`; the paths to the executables inside it are what we need.

### 3. PHP Configuration with shell access enabled

## Usage

Unix example:

```php
<?php
// if you are using composer, just use this
include 'vendor/autoload.php';

// initiate
$pdf = new \TonchikTm\PdfToHtml\Pdf('test.pdf', [
'pdftohtml_path' => '/usr/bin/pdftohtml',
'pdfinfo_path' => '/usr/bin/pdfinfo'
]);

// example for windows
// $pdf = new \TonchikTm\PdfToHtml\Pdf('test.pdf', [
// 'pdftohtml_path' => '/path/to/poppler/bin/pdftohtml.exe',
// 'pdfinfo_path' => '/path/to/poppler/bin/pdfinfo.exe'
// ]);

// get pdf info
$pdfInfo = $pdf->getInfo();

// get count pages
$countPages = $pdf->countPages();

// get content from one page
$contentFirstPage = $pdf->getHtml()->getPage(1);

// get content from all pages and loop over them
foreach ($pdf->getHtml()->getAllPages() as $page) {
echo $page . '<br/>';
}
```

## Feedback & Contribute

Open an issue for any improvement idea or bug you find. I love to help and solve other people's problems. Thanks :+1:
27 changes: 27 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "tonchik-tm/pdf-to-html",
"description": "This PHP class can convert your pdf files to html using poppler-utils.",
"license": "GNU GENERAL PUBLIC LICENSE",
"authors": [
{
"name": "Anton Nikolaev",
"email": "[email protected]",
"homepage": "http://www.tonchik.net",
"role": "Developer"
}
],
"minimum-stability": "stable",
"require": {
"pelago/emogrifier": "@dev",
"symfony/css-selector": "^2.7"
},

"autoload": {
"psr-0": {
"TonchikTm": "src"
},
"psr-4": {
"TonchikTm\\PdfToHtml\\": "src/"
}
}
}
95 changes: 95 additions & 0 deletions src/Base.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
<?php
/**
* Created by PhpStorm.
* User: tonchik™
* Date: 16.09.2015
* Time: 13:00
*/

namespace TonchikTm\PdfToHtml;

/**
* This is base class with common properties and methods.
*
* @property string $outputDir
* @property array $options
* @property array $defaultOptions
*/
/**
 * Base class holding the state shared by the converter classes:
 * a free-form options array and the output directory path.
 *
 * Both values are private and only reachable through the accessors below.
 */
class Base
{
    private $outputDir = '';
    private $options = [];

    /**
     * Get all options or one option by key.
     *
     * @param string|null $key Option name; any falsy value (null, '', '0') returns the whole options array.
     * @return mixed The full options array, the value stored under $key, or null when $key is not set.
     */
    public function getOptions($key=null)
    {
        if (!$key) {
            return $this->options;
        }

        return isset($this->options[$key]) ? $this->options[$key] : null;
    }

    /**
     * Set options as array or pair key-value.
     *
     * An array is merged over the current options (array_merge semantics);
     * a string key sets a single option. Any other key type is ignored.
     *
     * @param array|string $key Options array, or the name of a single option.
     * @param mixed|null $value Value for the single option; unused when $key is an array.
     */
    public function setOptions($key, $value=null)
    {
        if (is_array($key)) {
            $this->options = array_merge($this->options, $key);
        } elseif (is_string($key)) {
            $this->options[$key] = $value;
        }
    }

    /**
     * Get output dir.
     *
     * @return string
     */
    public function getOutputDir()
    {
        return $this->outputDir;
    }

    /**
     * Set output dir. The path is also mirrored into the 'outputDir' option.
     *
     * @param string $dir
     * @return $this
     */
    public function setOutputDir($dir)
    {
        $this->setOptions('outputDir', $dir);
        $this->outputDir = $dir;
        return $this;
    }

    /**
     * Clear all files that have been generated by pdftohtml.
     *
     * Make sure the directory ONLY contains generated files from pdftohtml,
     * because this removes everything below the output directory,
     * including nested sub-directories.
     *
     * When the output directory is empty or does not exist there is nothing
     * to clear and the call is a no-op.
     *
     * @param bool|false $removeSelf Also remove the output directory itself.
     * @return $this
     */
    public function clearOutputDir($removeSelf=false)
    {
        $dir = $this->getOutputDir();
        if (!is_string($dir) || $dir === '' || !is_dir($dir)) {
            return $this;
        }

        // CHILD_FIRST yields directories too, and only after their contents,
        // so every rmdir() below operates on an already emptied directory.
        // (The default LEAVES_ONLY mode never yields directories at all.)
        $entries = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dir, \FilesystemIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::CHILD_FIRST
        );

        foreach ($entries as $entry) {
            /** @var \SplFileInfo $entry */
            $path = $entry->getPathname();
            // Symlinks are removed as links, never descended into or rmdir'ed.
            if ($entry->isDir() && !$entry->isLink()) {
                rmdir($path);
            } else {
                unlink($path);
            }
        }

        if ($removeSelf) {
            rmdir($dir);
        }

        return $this;
    }
}
137 changes: 137 additions & 0 deletions src/Html.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
<?php
/**
* Created by PhpStorm.
* User: tonchik™
* Date: 15.09.2015
* Time: 19:18
*/

namespace TonchikTm\PdfToHtml;

use DOMDocument;
use DOMXPath;
use Pelago\Emogrifier;

/**
* This class creates a collection of html pages with some improvements.
*
* @property integer $pages
* @property string[] $content
*/
/**
 * Collection of HTML pages with optional post-processing applied to each
 * page as it is added:
 *
 *  - 'inlineCss'    : turn stylesheet rules into inline style attributes (Emogrifier);
 *  - 'inlineImages' : embed images found in the output directory as base64 data URIs;
 *  - 'onlyContent'  : keep only the inner HTML of <body>;
 *  - 'outputDir'    : directory the referenced image files are read from.
 */
class Html extends Base
{
    private $pages = 0;
    private $content = [];

    private $defaultOptions = [
        'inlineCss' => true,
        'inlineImages' => true,
        'onlyContent' => false,
        'outputDir' => ''
    ];

    /**
     * @param array $options Overrides for the default options; a non-array value is treated as "no overrides".
     */
    public function __construct($options=[])
    {
        $options = is_array($options) ? $options : [];
        $this->setOptions(array_merge($this->defaultOptions, $options));

        // Image inlining resolves files through getOutputDir(), which is only
        // updated by setOutputDir(); keep it in sync with the 'outputDir' option.
        $outputDir = $this->getOptions('outputDir');
        if (is_string($outputDir) && $outputDir !== '') {
            $this->setOutputDir($outputDir);
        }
    }

    /**
     * Add page to collection with the conversion, according to options.
     *
     * Adding a page under an already used number replaces that page.
     *
     * @param integer $number Page number used as the collection key.
     * @param string $content Raw HTML of the page.
     * @return $this
     */
    public function addPage($number, $content)
    {
        if ($this->getOptions('inlineCss')) {
            $content = $this->setInlineCss($content);
        }

        if ($this->getOptions('inlineImages')) {
            $content = $this->setInlineImages($content);
        }

        if ($this->getOptions('onlyContent')) {
            $content = $this->setOnlyContent($content);
        }

        $this->content[$number] = $content;
        // Count the pages in the collection, not the page string itself.
        $this->pages = count($this->content);
        return $this;
    }

    /**
     * Get the processed HTML of one page.
     *
     * @param integer $number
     * @return string|null Null when no page was added under that number.
     */
    public function getPage($number)
    {
        return isset($this->content[$number]) ? $this->content[$number] : null;
    }

    /**
     * Get the processed HTML of every page, keyed by page number.
     *
     * @return array
     */
    public function getAllPages()
    {
        return $this->content;
    }

    /**
     * The method replaces css class to inline css rules.
     *
     * @param string $content
     * @return string
     */
    private function setInlineCss($content)
    {
        // Strip the HTML comment markers before handing the markup to
        // Emogrifier, so rules wrapped in <!-- ... --> are not left commented out.
        $content = str_replace(['<!--', '-->'], '', $content);
        $parser = new Emogrifier($content);
        return $parser->emogrify();
    }

    /**
     * The method looks for images in html and replaces the src attribute
     * with a base64 data URI built from the file of the same name in the
     * output directory.
     *
     * Images that are already data URIs, or whose file cannot be read,
     * keep their original src.
     *
     * @param string $content
     * @return string
     */
    private function setInlineImages($content)
    {
        if (trim($content) === '') {
            return $content;
        }

        $dom = $this->loadDom($content);
        $replacements = [];

        foreach ($dom->getElementsByTagName('img') as $img) {
            /** @var \DOMElement $img */
            $src = $img->getAttribute('src');
            if ($src === '' || strpos($src, 'data:') === 0 || isset($replacements[$src])) {
                continue;
            }

            $file = $this->getOutputDir() . '/' . basename($src);
            if (!is_file($file) || !is_readable($file)) {
                continue;
            }

            $data = file_get_contents($file);
            if ($data === false) {
                continue;
            }

            // RFC 2397 form: no whitespace between the scheme and the media type.
            $replacements[$src] = 'data:' . mime_content_type($file) . ';base64,' . base64_encode($data);
        }

        // strtr() replaces in a single pass, so a later src can never match
        // inside base64 data inserted for an earlier one.
        return $replacements ? strtr($content, $replacements) : $content;
    }

    /**
     * The method takes from html body content only.
     *
     * @param string $content
     * @return string Trimmed inner HTML of <body>; '' for empty input.
     */
    private function setOnlyContent($content)
    {
        if (trim($content) === '') {
            return '';
        }

        $dom = $this->loadDom($content);
        $body = $dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            // No <body> in the parsed document: nothing to extract from.
            return trim($content);
        }

        $html = '';
        foreach ($body->childNodes as $node) {
            $html .= $dom->saveHTML($node);
        }
        return trim($html);
    }

    /**
     * Parse an HTML string into a DOM document without emitting libxml warnings.
     *
     * @param string $content Non-empty HTML.
     * @return DOMDocument
     */
    private function loadDom($content)
    {
        $previous = libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $dom->loadHTML($content);

        // Only discard collected errors when buffering was switched on here;
        // otherwise the buffer belongs to the caller.
        if (!$previous) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        return $dom;
    }
}
Loading

0 comments on commit 9f5f512

Please sign in to comment.