# AioScrape

A scraping library on top of aiohttp and parsechain. Note that this is alpha software.

## Installation

```
pip install aioscrape
```

## Usage

```python
from aioscrape import run, fetch, settings
from aioscrape.middleware import last_fetch, make_filecache
from aioscrape.utils import SOME_HEADERS  # To not look like a bot

from urllib.parse import urljoin
from parsechain import C
from funcy import lcat, lconcat


def main():
    # Settings are scoped and can be redefined later with another "with"
    cache = make_filecache('.fcache')
    with settings(headers=SOME_HEADERS, middleware=[cache, last_fetch]):
        print(run(scrape_all()))


async def scrape_all():
    # All the settings in scope, like headers and middleware, are applied to fetch()
    start_page = await fetch(START_URL)

    # AioScrape integrates with parsechain to make extracting a breeze
    urls = start_page.css('.pagingLinks a').attrs('href')
    list_urls = [urljoin(start_page.url, page_url) for page_url in urls]

    # Using asyncio.wait() and friends to run requests in parallel
    list_pages = [start_page] + await wait_all(map(fetch, list_urls))

    # Scrape articles from each listing page
    result = lcat(await wait_all(map(scrape_articles, list_pages)))
    write_to_csv('export.csv', result)


async def scrape_articles(list_page):
    urls = list_page.css('#headlines .titleLink').attrs('href')
    abs_urls = [urljoin(list_page.url, url) for url in urls]
    return await wait_all(map(scrape_article, abs_urls))


async def scrape_article(url):
    resp = await fetch(url)
    return resp.root.multi({
        'url': C.const(resp.url),
        'title': C.microdata('headline').first,
        'date': C.microdata('datePublished').first,
        'text': C.microdata('articleBody').first,
        'contacts': C.css('.sidebars .contact p')
                     .map(C.inner_html + html_to_text) + lconcat + ''.join,
    })


if __name__ == '__main__':
    main()
```
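
The comment in `main()` notes that settings are scoped and can be redefined later with another `with`. Here is a minimal sketch of what that can look like, reusing `scrape_all()` from the example above; the exact merge/override semantics of repeated `settings()` scopes are an assumption to verify against the library.

```python
# Sketch: redefining scoped settings with another "with" block.
# Assumes each settings() scope only affects the fetch() calls made inside it.
from aioscrape import run, settings
from aioscrape.middleware import last_fetch, make_filecache
from aioscrape.utils import SOME_HEADERS


def main():
    cache = make_filecache('.fcache')
    with settings(headers=SOME_HEADERS, middleware=[cache, last_fetch]):
        run(scrape_all())  # responses are cached in .fcache

    # Redefine the settings for another run: same headers, no file cache
    with settings(headers=SOME_HEADERS, middleware=[last_fetch]):
        run(scrape_all())  # scrape_all() is the coroutine from the example above
```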
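
The example also uses a few names it never defines: `START_URL`, `wait_all`, `write_to_csv` and `html_to_text`. The stand-ins below are one way to fill them in with plain Python; they are illustrative assumptions, not part of AioScrape's API.

```python
# Hypothetical helpers for the usage example; adapt to your own project.
import asyncio
import csv
import re

START_URL = 'https://example.com/news'  # placeholder start page


async def wait_all(coros):
    # Run awaitables concurrently and return their results in order
    return await asyncio.gather(*coros)


def html_to_text(html):
    # Crude tag stripper; real code would use lxml, html2text or similar
    return re.sub(r'<[^>]+>', ' ', html)


def write_to_csv(filename, rows):
    # Write a list of dicts with identical keys to a CSV file
    if not rows:
        return
    with open(filename, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)
```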

## TODO
- Response.follow()
- non-GET requests
- work with forms