init version 1.0.0

tonchik-tm · Sep 16, 2015 · 9f5f512 · 9f5f512
1 parent 1d697e0
commit 9f5f512
Show file tree

Hide file tree

Showing 6 changed files with 573 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1 @@
-assets/*
-!assets/.gitignore
-protected/runtime/*
-!protected/runtime/.gitignore
-protected/data/*.db
-themes/classic/views/
+/.idea
diff --git a/README.md b/README.md
@@ -1,2 +1,112 @@
-# pdf-to-html
+# PDF to HTML PHP Class
+
 This PHP class can convert your pdf files to html using poppler-utils.
+
+## Thanks
+
+Big thanks to Mochamad Gufron ([mgufrone](https://github.com/mgufrone))! This package is based on his package (https://github.com/mgufrone/pdf-to-html).
+
+## Important Notes
+
+Please see the Usage section below for how to use this class.
+
+## Installation
+
+From the root directory of your application, run this command to add the package to your app:
+
+```
+	composer require tonchik-tm/pdf-to-html:~1
+```
+
+Or add this package to your `composer.json`
+
+```json
+{
+	"require": {
+		"tonchik-tm/pdf-to-html": "~1"
+	}
+}
+```
+
+## Requirements
+### 1. Install Poppler-Utils
+
+**Debian/Ubuntu**
+```bash
+sudo apt-get install poppler-utils
+```
+
+**Mac OS X**
+```bash
+brew install poppler
+```
+
+**Windows**
+
+For those who need this package on Windows, there is a way. First, download the latest poppler-utils binary for Windows from <http://blog.alivate.com.au/poppler-windows/>.
+
+After downloading it, extract the archive.
+
+### 2. Find out where the utilities are located
+
+**Debian/Ubuntu**
+```bash
+$ whereis pdftohtml
+pdftohtml: /usr/bin/pdftohtml
+
+$ whereis pdfinfo
+pdfinfo: /usr/bin/pdfinfo
+```
+
+**Mac OS X**
+```bash
+$ which pdfinfo
+/usr/local/bin/pdfinfo
+
+$ which pdftohtml
+/usr/local/bin/pdftohtml
+```
+
+**Windows**
+
+Go to the extracted directory. It contains a directory called `bin`; this is the one we need.
+
+### 3. PHP Configuration with shell access enabled
+
+## Usage
+
+Unix example:
+
+```php
+<?php
+// if you are using composer, just use this
+include 'vendor/autoload.php';
+
+// initiate
+$pdf = new \TonchikTm\PdfToHtml\Pdf('test.pdf', [
+    'pdftohtml_path' => '/usr/bin/pdftohtml',
+    'pdfinfo_path' => '/usr/bin/pdfinfo'
+]);
+
+// example for windows
+// $pdf = new \TonchikTm\PdfToHtml\Pdf('test.pdf', [
+//     'pdftohtml_path' => '/path/to/poppler/bin/pdftohtml.exe',
+//     'pdfinfo_path' => '/path/to/poppler/bin/pdfinfo.exe'
+// ]);
+
+// get pdf info
+$pdfInfo = $pdf->getInfo();
+
+// get the number of pages
+$countPages = $pdf->countPages();
+
+// get content from one page
+$contentFirstPage = $pdf->getHtml()->getPage(1);
+
+// get the content of all pages and loop over them
+foreach ($pdf->getHtml()->getAllPages() as $page) {
+    echo $page . '<br/>';
+}
+```
+
+## Feedback & Contribute
+
+Open an issue for improvements or any bugs you find. I love to help and solve other people's problems. Thanks :+1:
diff --git a/composer.json b/composer.json
@@ -0,0 +1,27 @@
+{
+  "name": "tonchik-tm/pdf-to-html",
+  "description": "This PHP class can convert your pdf files to html using poppler-utils.",
+  "license": "GNU GENERAL PUBLIC LICENSE",
+  "authors": [
+    {
+      "name": "Anton Nikolaev",
+      "email": "[email protected]",
+      "homepage": "http://www.tonchik.net",
+      "role": "Developer"
+    }
+  ],
+  "minimum-stability": "stable",
+  "require": {
+    "pelago/emogrifier": "@dev",
+    "symfony/css-selector": "^2.7"
+  },
+
+  "autoload": {
+    "psr-0": {
+      "TonchikTm": "src"
+    },
+    "psr-4": {
+      "TonchikTm\\PdfToHtml\\": "src/"
+    }
+  }
+}
diff --git a/src/Base.php b/src/Base.php
@@ -0,0 +1,95 @@
+<?php
+/**
+ * Created by PhpStorm.
+ * User: tonchik™
+ * Date: 16.09.2015
+ * Time: 13:00
+ */
+
+namespace TonchikTm\PdfToHtml;
+
+/**
+ * Base class with the state shared by the converter classes: a map of
+ * options and the directory that holds the files generated by pdftohtml.
+ */
+class Base
+{
+    /** @var string Directory that holds the files generated by pdftohtml. */
+    private $outputDir = '';
+
+    /** @var array Option values indexed by option name. */
+    private $options = [];
+
+    /**
+     * Get all options or one option by key.
+     *
+     * @param string|null $key Option name, or null to get the whole map.
+     * @return mixed The whole options array when $key is null; otherwise the
+     *               stored value, or null when the option is not set.
+     */
+    public function getOptions($key = null)
+    {
+        // Compare against null explicitly: a truthiness test would treat the
+        // key '0' as "no key given" and hand back every option instead.
+        if ($key === null) {
+            return $this->options;
+        }
+
+        return isset($this->options[$key]) ? $this->options[$key] : null;
+    }
+
+    /**
+     * Set options as array or pair key-value.
+     *
+     * An array is merged over the current options; a string key sets a single
+     * option. Any other key type is ignored.
+     *
+     * @param array|string $key Map of options, or the name of one option.
+     * @param mixed $value Value for the option when $key is a string.
+     */
+    public function setOptions($key, $value = null)
+    {
+        if (is_array($key)) {
+            $this->options = array_merge($this->options, $key);
+        } elseif (is_string($key)) {
+            $this->options[$key] = $value;
+        }
+    }
+
+    /**
+     * Get output dir.
+     *
+     * @return string
+     */
+    public function getOutputDir()
+    {
+        return $this->outputDir;
+    }
+
+    /**
+     * Set output dir. The value is also mirrored into the 'outputDir' option.
+     *
+     * @param string $dir
+     * @return $this
+     */
+    public function setOutputDir($dir)
+    {
+        $this->setOptions('outputDir', $dir);
+        $this->outputDir = $dir;
+        return $this;
+    }
+
+    /**
+     * Clear all files that have been generated by pdftohtml.
+     *
+     * Make sure the directory ONLY contains files generated by pdftohtml,
+     * because everything below the output directory is removed, including
+     * nested sub-directories. Nothing happens when the output directory is
+     * unset or does not exist.
+     *
+     * @param bool $removeSelf Also remove the output directory itself.
+     * @return $this
+     */
+    public function clearOutputDir($removeSelf = false)
+    {
+        $dir = $this->getOutputDir();
+
+        // Nothing to clear; the directory iterator would throw on a missing path.
+        if (!is_string($dir) || !is_dir($dir)) {
+            return $this;
+        }
+
+        // CHILD_FIRST yields the contents of a directory before the directory
+        // itself. The default mode yields leaves only, so sub-directories were
+        // never visited and could not be removed.
+        $entries = new \RecursiveIteratorIterator(
+            new \RecursiveDirectoryIterator($dir, \FilesystemIterator::SKIP_DOTS),
+            \RecursiveIteratorIterator::CHILD_FIRST
+        );
+
+        foreach ($entries as $entry) {
+            /** @var \SplFileInfo $entry */
+            $path = $entry->getPathname();
+
+            // A symlink that points at a directory is not descended into, so
+            // it is removed as a plain entry rather than with rmdir().
+            if ($entry->isDir() && !$entry->isLink()) {
+                rmdir($path);
+            } else {
+                unlink($path);
+            }
+        }
+
+        if ($removeSelf) {
+            rmdir($dir);
+        }
+
+        return $this;
+    }
+}
diff --git a/src/Html.php b/src/Html.php
@@ -0,0 +1,137 @@
+<?php
+/**
+ * Created by PhpStorm.
+ * User: tonchik™
+ * Date: 15.09.2015
+ * Time: 19:18
+ */
+
+namespace TonchikTm\PdfToHtml;
+
+use DOMDocument;
+use DOMXPath;
+use Pelago\Emogrifier;
+
+/**
+ * Collection of HTML pages with optional post-processing applied as each page
+ * is added: CSS inlining, image inlining and extraction of the body content.
+ */
+class Html extends Base
+{
+    /** @var int Number of pages currently stored in the collection. */
+    private $pages = 0;
+
+    /** @var string[] Page HTML indexed by page number. */
+    private $content = [];
+
+    /** @var array Defaults that the constructor options are merged over. */
+    private $defaultOptions = [
+        'inlineCss' => true,
+        'inlineImages' => true,
+        'onlyContent' => false,
+        'outputDir' => ''
+    ];
+
+    /**
+     * @param array $options Overrides for the defaults; recognised keys are
+     *                       inlineCss, inlineImages, onlyContent and outputDir.
+     */
+    public function __construct($options = [])
+    {
+        $this->setOptions(array_merge($this->defaultOptions, $options));
+
+        // setInlineImages() locates files through getOutputDir(), so an output
+        // directory passed as an option has to reach that property as well.
+        $outputDir = $this->getOptions('outputDir');
+        if (is_string($outputDir) && $outputDir !== '') {
+            $this->setOutputDir($outputDir);
+        }
+    }
+
+    /**
+     * Add page to collection with the conversion, according to options.
+     *
+     * Enabled conversions run in this order: inline CSS, inline images,
+     * body-only extraction.
+     *
+     * @param integer $number Page number used as the collection key.
+     * @param string $content Raw HTML of the page.
+     * @return $this
+     */
+    public function addPage($number, $content)
+    {
+        if ($this->getOptions('inlineCss')) {
+            $content = $this->setInlineCss($content);
+        }
+
+        if ($this->getOptions('inlineImages')) {
+            $content = $this->setInlineImages($content);
+        }
+
+        if ($this->getOptions('onlyContent')) {
+            $content = $this->setOnlyContent($content);
+        }
+
+        $this->content[$number] = $content;
+        // Count the collection, not the page that was just stored: the page
+        // is a string and is not countable.
+        $this->pages = count($this->content);
+        return $this;
+    }
+
+    /**
+     * Get the HTML of one page.
+     *
+     * @param integer $number
+     * @return string|null Null when no page is stored under $number.
+     */
+    public function getPage($number)
+    {
+        return isset($this->content[$number]) ? $this->content[$number] : null;
+    }
+
+    /**
+     * Get the HTML of every page, indexed by page number.
+     *
+     * @return string[]
+     */
+    public function getAllPages()
+    {
+        return $this->content;
+    }
+
+    /**
+     * The method replaces css class to inline css rules.
+     *
+     * @param string $content
+     * @return string
+     */
+    private function setInlineCss($content)
+    {
+        // Drop the HTML comment markers before the content is handed to
+        // Emogrifier; only the markers are removed, their content stays.
+        $content = str_replace(['<!--', '-->'], '', $content);
+        $parser = new Emogrifier($content);
+        return $parser->emogrify();
+    }
+
+    /**
+     * The method looks for images in html and replaces the src attribute
+     * with a base64 data URI built from the file in the output directory.
+     *
+     * Images whose file is missing or unreadable keep their original src.
+     *
+     * @param string $content
+     * @return string
+     */
+    private function setInlineImages($content)
+    {
+        $dom = $this->loadDom($content);
+        if ($dom === null) {
+            return $content;
+        }
+
+        $processed = [];
+        foreach ($dom->getElementsByTagName('img') as $img) {
+            /** @var \DOMElement $img */
+            $attrImage = $img->getAttribute('src');
+
+            // One str_replace() already rewrites every occurrence of a src.
+            if ($attrImage === '' || isset($processed[$attrImage])) {
+                continue;
+            }
+            $processed[$attrImage] = true;
+
+            $image = $this->getOutputDir() . '/' . basename($attrImage);
+            if (!is_file($image) || !is_readable($image)) {
+                continue;
+            }
+
+            $imageData = file_get_contents($image);
+            $mimeType = mime_content_type($image);
+            if ($imageData === false || $mimeType === false) {
+                continue;
+            }
+
+            // No whitespace after "data:" - the data URI syntax does not allow it.
+            $src = 'data:' . $mimeType . ';base64,' . base64_encode($imageData);
+            $content = str_replace($attrImage, $src, $content);
+        }
+
+        return $content;
+    }
+
+    /**
+     * The method takes from html body content only.
+     *
+     * @param string $content
+     * @return string Trimmed markup of the body children; an empty string
+     *                when the content is empty or has no body element.
+     */
+    private function setOnlyContent($content)
+    {
+        $dom = $this->loadDom($content);
+        if ($dom === null) {
+            return '';
+        }
+
+        $body = $dom->getElementsByTagName('body')->item(0);
+        if ($body === null) {
+            return '';
+        }
+
+        $html = '';
+        foreach ($body->childNodes as $node) {
+            $html .= $dom->saveHTML($node);
+        }
+
+        return trim($html);
+    }
+
+    /**
+     * Parse an HTML string into a DOMDocument without emitting libxml warnings.
+     *
+     * @param string $content
+     * @return DOMDocument|null Null when $content is empty or cannot be parsed.
+     */
+    private function loadDom($content)
+    {
+        if (!is_string($content) || trim($content) === '') {
+            return null;
+        }
+
+        // Collect parser errors internally instead of raising PHP warnings,
+        // then restore whatever setting the caller had.
+        $previous = libxml_use_internal_errors(true);
+        $dom = new DOMDocument();
+        $loaded = $dom->loadHTML($content);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+
+        return $loaded ? $dom : null;
+    }
+}