Py Collector

Py Collector is a simple, reliable, DB-agnostic framework for consistently collecting data from any source.

It utilizes two main components, the Collector and the Scheduler.

Check out the imports needed to run the examples here.

Code Examples

Data	Code
Collect Weather Data into SQL Alchemy	class Weather(Collector): start_time = datetime.now()#to start immediatly scheduler = Scheduler(days=1/24, count=1, separator=1, start_time = start_time) def upload(self): ''' Runs on schedule, and will only run if is_new returns true''' r = requests.get('https://api.weather.gov/gridpoints/FWD/59,23/forecast') data = r.json()['properties']['periods'] points = [] for i in data: data_point = WeatherDataPoint( start_date=datetime.fromisoformat(i['startTime']), end_date=datetime.fromisoformat(i['endTime']), temp=i['temperature'], windspeed=i['windSpeed'] ) points.append(data_point) session.add_all(points) session.commit() def is_new(self): '''Evaluates if the data should be uploaded, if it only returns True, then it will just upload on schedule.''' return True
Collect Energy Data into a CSV every minute	class Energy(Collector): start_time = datetime.now()#to start immediatly scheduler = Scheduler(minutes=1, #every minute count=2, #try 3 times separator=2, #two seconds between tries start_time = start_time) first_run = True last_update = None def upload(self): ''' Runs on schedule, and will only run if is_new returns true''' df = pd.read_html(self.get_site.text)[0] title = 'ercot_dam_clearing_'+self.last_update.strftime('%m_%d_%Y')+'.csv' file = open(title,'w') df.to_csv(file) def is_new(self): '''Evaluates if the data should be uploaded, if it only returns True, then it will just upload on schedule.''' if self.first_run: #first run load whatever is there self.first_run = False self.last_update = self.get_last_changed() return True else: #if it has changed since we last updated, download last_changed = self.get_last_changed() if self.last_update < last_changed: self.last_update = last_changed return True else: return False def get_site(self): return requests.get('http://www.ercot.com/content/cdr/html/actual_loads_of_forecast_zones') def soup(self): r = self.get_site() return BeautifulSoup(r.text,'html.parser') def get_last_changed(self): soup = self.soup() last_change = soup.find('div',attrs={'class':'schedTime rightAlign'}) last_change = last_change.text.split('Time:')[1].lstrip() return datetime.strptime(last_change,'%b %d, %Y %H:%M')
Collect TikTok Data into a MongoDB every day	class TikTokUser(MongoModel): username = fields.CharField() followers = fields.CharField() likes = fields.CharField() following = fields.CharField() class Meta: write_concern = WriteConcern(j=True) connection_alias = 'my-app' class TikTok(Collector): start_time = datetime.now() scheduler = Scheduler(days=1, #every date count=1, #try 1 times separator=1, #not applicable start_time = start_time) #start now def upload(self): ''' Runs on schedule, and will only run if is_new returns true''' data = self.user_stats('gordonramsayofficial') user = TikTokUser.from_document(data) user.save() def is_new(self): '''Evaluates if the data should be uploaded, if it only returns True, then it will just upload on schedule.''' return True def user_stats(self,user ='gordonramsayofficial'): r = self.user_raw(user) soup = BeautifulSoup(r.text,'html.parser') info = soup.find('h2',attrs={'class':'count-infos'}) return { 'following':info.find('strong',attrs={'title':'Following'}).text, 'followers':info.find('strong',attrs={'title':'Followers'}).text, 'likes':info.find('strong',attrs={'title':'Likes'}).text, 'username':user } def user_raw(self, user): headers={ "authority": "m.tiktok.com", "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0", "method": "GET", "scheme": "https", "accept": "application/json, text/plain, /", "accept-encoding": 'gzip, deflate, utf-8', "accept-language": "en-US,en;q=0.9", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-site", "sec-gpc": "1" } return requests.get(f'https://www.tiktok.com/@{user}?lang=en',headers=headers)

Data

Code

Collect Weather Data into SQL Alchemy

class Weather(Collector):
    """Collector that stores weather.gov forecast periods through SQLAlchemy."""

    # Evaluated when the class body runs, so the schedule starts immediately.
    start_time = datetime.now()
    scheduler = Scheduler(
        days=1/24,  # one twenty-fourth of a day
        count=1,
        separator=1,
        start_time=start_time,
    )

    def is_new(self):
        """Decide whether the data should be uploaded.

        Always returns True, so the upload simply happens on schedule.
        """
        return True

    def upload(self):
        """Fetch the forecast and persist one row per forecast period.

        Runs on schedule, and only when ``is_new`` returns True.
        """
        response = requests.get('https://api.weather.gov/gridpoints/FWD/59,23/forecast')
        periods = response.json()['properties']['periods']

        # Build every WeatherDataPoint up front, then store them in one commit.
        session.add_all([
            WeatherDataPoint(
                start_date=datetime.fromisoformat(period['startTime']),
                end_date=datetime.fromisoformat(period['endTime']),
                temp=period['temperature'],
                windspeed=period['windSpeed'],
            )
            for period in periods
        ])
        session.commit()

Collect Energy Data into a CSV every minute

class Energy(Collector):
    """Collect the ERCOT forecast-zone load table into a dated CSV file.

    ``is_new`` scrapes the page's timestamp and lets ``upload`` run only when
    that timestamp is newer than the one seen on the previous check. The first
    check always passes.
    """

    # Evaluated when the class body runs, so the schedule starts immediately.
    start_time = datetime.now()

    scheduler = Scheduler(minutes=1,  # every minute
                          # NOTE(review): the original comment said "try 3 times"
                          # next to count=2 -- confirm how Scheduler counts tries.
                          count=2,
                          separator=2,  # two seconds between tries
                          start_time=start_time)
    first_run = True    # True until the first successful is_new() check
    last_update = None  # page timestamp recorded by the last accepted check

    def upload(self):
        """Download the page's first HTML table and write it to a CSV file.

        Runs on schedule, and only when ``is_new`` returns True. The file name
        is derived from ``self.last_update``, which ``is_new`` sets.
        """
        # get_site is a method: it must be called to obtain the response.
        df = pd.read_html(self.get_site().text)[0]
        title = 'ercot_dam_clearing_'+self.last_update.strftime('%m_%d_%Y')+'.csv'
        # The context manager guarantees the file is flushed and closed.
        with open(title, 'w') as file:
            df.to_csv(file)

    def is_new(self):
        """Return True when the page changed since the last accepted check.

        The first call always returns True and records the current timestamp.
        """
        # Scrape before touching any state, so a failed request leaves
        # first_run/last_update unchanged and the next check can retry.
        last_changed = self.get_last_changed()

        if self.first_run:
            # First run: load whatever is there.
            self.first_run = False
            self.last_update = last_changed
            return True

        # Later runs: download only if the page changed since we last updated.
        if self.last_update < last_changed:
            self.last_update = last_changed
            return True
        return False

    def get_site(self):
        """Request the ERCOT page and return the ``requests`` response."""
        return requests.get('http://www.ercot.com/content/cdr/html/actual_loads_of_forecast_zones')

    def soup(self):
        """Return the ERCOT page parsed with BeautifulSoup's 'html.parser'."""
        r = self.get_site()
        return BeautifulSoup(r.text, 'html.parser')

    def get_last_changed(self):
        """Parse the page's update timestamp into a naive ``datetime``.

        Reads the text after 'Time:' in the 'schedTime rightAlign' div and
        parses it with the format '%b %d, %Y %H:%M'.
        """
        soup = self.soup()
        last_change = soup.find('div', attrs={'class': 'schedTime rightAlign'})
        last_change = last_change.text.split('Time:')[1].lstrip()
        return datetime.strptime(last_change, '%b %d, %Y %H:%M')

Collect TikTok Data into a MongoDB every day

class TikTokUser(MongoModel):
    """MongoDB document holding one snapshot of a TikTok profile's counters."""

    # All values are stored as text, exactly as scraped from the page.
    username = fields.CharField()
    followers = fields.CharField()
    likes = fields.CharField()
    following = fields.CharField()

    class Meta:
        # NOTE(review): j=True presumably requests journaled write
        # acknowledgement -- confirm against the pymongo WriteConcern docs.
        write_concern = WriteConcern(j=True)
        # NOTE(review): presumably must match the alias used when the MongoDB
        # connection is registered elsewhere -- verify.
        connection_alias = 'my-app'

class TikTok(Collector):
    """Collector that saves a TikTok profile's public counters as a ``TikTokUser``."""

    start_time = datetime.now()

    scheduler = Scheduler(
        days=1,  # every day
        count=1,  # try once
        separator=1,  # not applicable
        start_time=start_time,  # start now
    )

    def is_new(self):
        """Decide whether the data should be uploaded.

        Always returns True, so the upload simply happens on schedule.
        """
        return True

    def upload(self):
        """Scrape the 'gordonramsayofficial' profile and save it.

        Runs on schedule, and only when ``is_new`` returns True.
        """
        stats = self.user_stats('gordonramsayofficial')
        TikTokUser.from_document(stats).save()

    def user_stats(self, user='gordonramsayofficial'):
        """Return a dict with the following, followers, likes and username of ``user``."""
        page = BeautifulSoup(self.user_raw(user).text, 'html.parser')
        counters = page.find('h2', attrs={'class': 'count-infos'})

        # Each counter is a <strong> tag identified by its title attribute.
        stats = {
            key: counters.find('strong', attrs={'title': title}).text
            for key, title in (
                ('following', 'Following'),
                ('followers', 'Followers'),
                ('likes', 'Likes'),
            )
        }
        stats['username'] = user
        return stats

    def user_raw(self, user):
        """Request the profile page of ``user`` and return the raw response."""
        profile_url = f'https://www.tiktok.com/@{user}?lang=en'
        request_headers = {
            "authority": "m.tiktok.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
            "method": "GET",
            "scheme": "https",
            "accept": "application/json, text/plain, */*",
            "accept-encoding": 'gzip, deflate, utf-8',
            "accept-language": "en-US,en;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "sec-gpc": "1",
        }
        return requests.get(profile_url, headers=request_headers)

py-collector
Release 0.0.25

Release 0.0.25

0.0.25

0.0.24

0.0.3

0.0.23

0.0.22

0.0.21

0.0.1

Documentation

Py Collector

Code Examples

Stats

Releases

Contributors

py-collector Release 0.0.25

Release 0.0.25 Toggle Dropdown 0.0.25 0.0.24 0.0.3 0.0.23 0.0.22 0.0.21 0.0.1

Documentation

Py Collector

Code Examples

Stats

Releases

Contributors

py-collector
Release 0.0.25

Release 0.0.25

0.0.25

0.0.24

0.0.3

0.0.23

0.0.22

0.0.21

0.0.1