ixnode/php-web-crawler

PHP Web Crawler - This PHP class allows you to recursively crawl a given HTML page (or a given HTML file) and collect some data from it.


Keywords
php, json, web, html, array, scraper, spider, crawler, recursive
License
MIT

Documentation

PHP Web Crawler

Release PHP PHPStan PHPUnit PHPCS PHPMD Rector - Instant Upgrades and Automated Refactoring LICENSE

This PHP class allows you to recursively crawl a given HTML page (or a given HTML file) and collect some data from it. Simply define the URL (or an HTML file) and a set of XPath expressions that map to the fields of the output data object. The final representation is a PHP array, which can easily be converted into JSON for further processing.

1. Installation

composer require ixnode/php-web-crawler
vendor/bin/php-web-crawler -V
php-web-crawler 0.1.0 (02-24-2024 14:46:26) - Björn Hempel <bjoern@hempel.li>

2. Usage

2.1 PHP Code

use Ixnode\PhpWebCrawler\Output\Field;
use Ixnode\PhpWebCrawler\Source\Raw;
use Ixnode\PhpWebCrawler\Value\Text;
use Ixnode\PhpWebCrawler\Value\XpathTextNode;

$rawHtml = <<<HTML
<html>
    <head>
        <title>Test Page</title>
    </head>
    <body>
        <h1>Test Title</h1>
        <p>Test Paragraph</p>
    </body>
</html>
HTML;

$html = new Raw(
    $rawHtml,
    new Field('version', new Text('1.0.0')),
    new Field('title', new XpathTextNode('//h1')),
    new Field('paragraph', new XpathTextNode('//p'))
);

$html->parse()->getJsonStringFormatted();
// See below

2.2 JSON result

{
    "version": "1.0.0",
    "title": "Test Title",
    "paragraph": "Test Paragraph"
}

3. Advanced usage

3.1 Group

PHP Code

use Ixnode\PhpWebCrawler\Output\Field;
use Ixnode\PhpWebCrawler\Output\Group;
use Ixnode\PhpWebCrawler\Source\Raw;
use Ixnode\PhpWebCrawler\Value\XpathTextNode;

$rawHtml = <<<HTML
<html>
    <head>
        <title>Test Page</title>
    </head>
    <body>
        <h1>Test Title</h1>
        <p class="paragraph-1">Test Paragraph 1</p>
        <p class="paragraph-2">Test Paragraph 2</p>
    </body>
</html>
HTML;

$html = new Raw(
    $rawHtml,
    new Field('title', new XpathTextNode('/html/head/title')),
    new Group(
        'content',
        new Group(
            'header',
            new Field('h1', new XpathTextNode('/html/body//h1')),
        ),
        new Group(
            'text',
            new Field('p1', new XpathTextNode('/html/body//p[@class="paragraph-1"]')),
            new Field('p2', new XpathTextNode('/html/body//p[@class="paragraph-2"]')),
        )
    )
);

$html->parse()->getJsonStringFormatted();
// See below

JSON result

{
  "title": "Test Page",
  "content": {
    "header": {
      "h1": "Test Title"
    },
    "text": {
      "p1": "Test Paragraph 1",
      "p2": "Test Paragraph 2"
    }
  }
}

3.2 XpathSection

PHP Code

use Ixnode\PhpWebCrawler\Output\Field;
use Ixnode\PhpWebCrawler\Output\Group;
use Ixnode\PhpWebCrawler\Source\Raw;
use Ixnode\PhpWebCrawler\Source\XpathSection;
use Ixnode\PhpWebCrawler\Value\XpathTextNode;

$rawHtml = <<<HTML
<html>
    <head>
        <title>Test Page</title>
    </head>
    <body>
        <div class="content">
            <h1>Test Title</h1>
            <p class="paragraph-1">Test Paragraph 1</p>
            <p class="paragraph-2">Test Paragraph 2</p>
        </div>
    </body>
</html>
HTML;

$html = new Raw(
    $rawHtml,
    new Field('title', new XpathTextNode('/html/head/title')),
    new Group(
        'content',
        new XpathSection(
            '/html/body//div[@class="content"]',
            new Group(
                'header',
                new Field('h1', new XpathTextNode('./h1')),
            ),
            new Group(
                'text',
                new Field('p1', new XpathTextNode('./p[@class="paragraph-1"]')),
                new Field('p2', new XpathTextNode('./p[@class="paragraph-2"]')),
            )
        )
    )
);

$html->parse()->getJsonStringFormatted();
// See below

JSON result

{
    "title": "Test Page",
    "content": {
        "header": {
            "h1": "Test Title"
        },
        "text": {
            "p1": "Test Paragraph 1",
            "p2": "Test Paragraph 2"
        }
    }
}

3.3 XpathSections (flat)

PHP Code

use Ixnode\PhpWebCrawler\Output\Field;
use Ixnode\PhpWebCrawler\Output\Group;
use Ixnode\PhpWebCrawler\Source\Raw;
use Ixnode\PhpWebCrawler\Source\XpathSections;
use Ixnode\PhpWebCrawler\Value\XpathTextNode;

$rawHtml = <<<HTML
<html>
    <head>
        <title>Test Page</title>
    </head>
    <body>
        <div class="content">
            <h1>Test Title</h1>
            <p class="paragraph-1">Test Paragraph 1</p>
            <p class="paragraph-2">Test Paragraph 2</p>
            <ul>
                <li>Test Item 1</li>
                <li>Test Item 2</li>
            </ul>
        </div>
    </body>
</html>
HTML;

$html = new Raw(
    $rawHtml,
    new Field('title', new XpathTextNode('/html/head/title')),
    new Group(
        'hits',
        new XpathSections(
            '/html/body//div[@class="content"]/ul',
            new XpathTextNode('./li/text()'),
        )
    )
);

$html->parse()->getJsonStringFormatted();
// See below

JSON result

{
    "title": "Test Page",
    "hits": [
        [
            "Test Item 1",
            "Test Item 2"
        ]
    ]
}

3.4 XpathSections (structured)

PHP Code

use Ixnode\PhpWebCrawler\Output\Field;
use Ixnode\PhpWebCrawler\Output\Group;
use Ixnode\PhpWebCrawler\Source\Raw;
use Ixnode\PhpWebCrawler\Source\XpathSections;
use Ixnode\PhpWebCrawler\Value\XpathTextNode;

$rawHtml = <<<HTML
<html>
    <head>
        <title>Test Page</title>
    </head>
    <body>
        <div class="content">
            <h1>Test Title</h1>
            <p class="paragraph-1">Test Paragraph 1</p>
            <p class="paragraph-2">Test Paragraph 2</p>
            <table>
                <tbody>
                    <tr>
                        <th>Caption 1</th>
                        <td>Cell 1</td>
                    </tr>
                    <tr>
                        <th>Caption 2</th>
                        <td>Cell 2</td>
                    </tr>
                </tbody>
            </table>
        </div>
    </body>
</html>
HTML;

$html = new Raw(
    $rawHtml,
    new Field('title', new XpathTextNode('/html/head/title')),
    new Group(
        'hits',
        new XpathSections(
            '/html/body//div[@class="content"]/table/tbody/tr',
            new Field('caption', new XpathTextNode('./th/text()')),
            new Field('content', new XpathTextNode('./td/text()')),
        )
    )
);

$html->parse()->getJsonStringFormatted();
// See below

JSON result

{
    "title": "Test Page",
    "hits": [
        {
            "caption": "Caption 1",
            "content": "Cell 1"
        },
        {
            "caption": "Caption 2",
            "content": "Cell 2"
        }
    ]
}

4. More examples

5. Development

git clone git@github.com:ixnode/php-web-crawler.git && cd php-web-crawler
composer install
composer test

6. License

This library is licensed under the MIT License - see the LICENSE.md file for details.