If you already installed ptt.py with pip, just use command ptt to start crawling:
# $ ptt -b <board name> -i <start_page> <end_page> -f <output format>
# dump article jsons in the latest page of board "gossiping"
# -1 means last page and -2 means second last page...
$ ptt -b gossiping -i -1 -1 -f json
or you can run library module as script:
$ python -m ptt -b gossiping -i -1 -1 -f json
If you want to use it directly, just cd to repo dir and run script ptt.py:
$ ptt -h
usage: ptt [-h] -b BOARD [-d DIR] [-f {json,csv}]
(-a ID | -i START END START END)
ptt.py
optional arguments:
-h, --help show this help message and exit
-b BOARD, --board BOARD
board name
-d DIR, --destination DIR
destination
-f {json,csv}, --format {json,csv}
output format (default: json)
-a ID, --aid ID article id
-i START END START END, --index START END START END
start/end index
Use as Library
from ptt import Board
latest_page = Board('gossiping')
for summary in latest_page:
if summary.isremoved:
continue
article = summary.read()
print(article.dump_json())
# iterate all article summaries from specified board
for summary in Board('gossiping'):
print(summary.title, summary.url)
# read article from summary
if not summary.isremoved:
article = summary.read()
attribute
Attr Name
Type
Note
Example
title
str
title of Article
'[協尋] 12月18日 晚上9點前後 高雄市明誠路 鼎'
category
str
string between '[' and ']' in the title
'協尋'
url
str
url of the Article without ptt domain name
'/bbs/Gossiping/M.1513683634.A.2F5.html'
board
str
board name of Article
'Gossiping'
aid
str
Article ID
'M.1513683634.A.2F5'
date
str
string of Article date
'7/24'
author
str
string of Article author (only author id)
'jokerndmc'
score
str
string of score or '爆' for score>99 or 'X' for score<0
'20'
mark
str
Article mark
'M'
removeinfo
str
removal information written in the title
'(本文已被刪除) [SamuraiJack]'
isreply
bool
True if 'Re:' in title else False
True
isforward
bool
True if 'Fw:' in title else False
False
isremoved
bool
True if Article has been removed else False
True
API
API Name
Return Type
Note
read()
ArticlePage
return corresponding ArticlePage if it is not removed
class ArticleListPage
example
# get page-20 of specified board
lst_page = ArticleListPage.from_board('gossiping', 20)
# you can also use the alias "Board" instead
lst_page = Board('gossiping', 20)
# get the newest page of specified board by giving no page index
lst_page = Board('gossiping')
# iterate all article summaries of an article list page
for summary in lst_page:
print(summary)
# get first article summary
summary = lst_page.get_article_summary(0)
attribute
Attr Name
Type
Note
Example
board
str
board name of this ArticleListPage
'Gossiping'
idx
int
index of ArticleListPage
29585
related_urls['board']
str
latest article list page url of the board
'/bbs/Gossiping/index.html'
related_urls['man']
str
精華區 url of the board
'/man/Gossiping/index.html'
related_urls['previous']
str
previous article list page url (None if not exists)
'/bbs/Gossiping/index29584.html'
related_urls['next']
str
next article list page url (None if not exists)
None
related_urls['oldest']
str
oldest article list page url
'/bbs/Gossiping/index1.html'
related_urls['newest']
str
newest article list page url
'/bbs/Gossiping/index.html'
previous
ArticleListPage
ArticleListPage of related_urls['previous']
next
ArticleListPage
ArticleListPage of related_urls['next']
oldest
ArticleListPage
ArticleListPage of related_urls['oldest']
newest
ArticleListPage
ArticleListPage of related_urls['newest']
article_summaries
generator of ArticleSummary
ArticleSummary generator of this ArticleListPage
API
API Name
Return Type
Note
get_article_summary(index)
ArticleSummary
get ArticleSummary by given index
class ArticlePage: alias to Article
example
# get article by board name and aid
article = ArticlePage.from_board_aid('gossiping', 'M.1513683634.A.2F5')
# you can also use the alias "Article" instead
article = Article.from_board_aid('gossiping', 'M.1513683634.A.2F5')
# dump json string with aid and author
string = article.dump_json('aid', 'author')
print(string)
attribute
Attr Name
Type
Note
Example
title
str
title of Article
'[協尋] 12月19日 晚上9點前後 高雄市明誠路 鼎'
category
str
string in '[' and ']' of title
'協尋'
board
str
board name of Article
'Gossiping'
aid
str
Article ID
'M.1513683634.A.2F5'
date
str
string of Article date
'Tue Feb 16 20:15:23 2016'
datetime
datetime
datetime format of date
datetime.datetime(2017, 12, 19, 19, 40, 31)
author
str
string of Article author
'jokerndmc (小人物)'
ip
str
author's ip
'115.82.209.7'
signature
str
signature string of the author
pushes
Pushes
Pushes is a class which collects all pushes in article
content
str
main content of article using html format
isreply
bool
True if 'Re:' in title else False
False
isforward
bool
True if 'Fw:' in title else False
False
API
API Name
Return Type
Note
dump_json(*attrs, flat=False)
str
dump json string with specified attrs
dump_csv(*attrs, delimiter=',')
str
dump csv string with specified attrs
class Pushes
example
# get simple expression (list of dictionary) of a Pushes
>>> pushes.simple_expression
[...
{'content': '幫高調,雖然機會不高但還是希望可以找到!',
'ipdatetime': '12/19 22:22',
'type': '推',
'user': 'aquami'},
{'content': '住附近 突然發現有鼎吉路',
'ipdatetime': '12/21 01:34',
'type': '推',
'user': 'sh981215'},
...
]
attribute
Attr Name
Type
Note
Example
article
ArticlePage
ArticlePage of these pushes
msgs
list
list of Msg(self-defined namedtuple)
count['all']
int
total msg in Pushes
38
count['score']
int
positive msg count - negative msg count
23
count['like']
int
positive msg count
26
count['boo']
int
negative msg count
3
count['neutral']
int
neutral msg count
9
simple_expression
list
list of dictionaries which are used to model every Msg
The Tidelift Subscription provides access to a continuously curated stream of human-researched and maintainer-verified data on open source packages and their licenses, releases, vulnerabilities, and development practices.