-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c0b800b
commit cb2cb05
Showing
7 changed files
with
246 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
from apscheduler.schedulers.background import BackgroundScheduler | ||
from apscheduler.jobstores.base import JobLookupError | ||
import requests | ||
import datetime | ||
import maya | ||
import feedparser | ||
|
||
import newsDB | ||
|
||
|
||
class Crawler():
    """Fetch Google News RSS search results on a schedule and store them.

    A ``BackgroundScheduler`` runs :meth:`exec` on worker threads; each run
    downloads the RSS feed for one keyword and hands every entry to the
    ``newsDB`` manager for insertion.
    """

    # Search endpoint; ``when:1d`` is appended to the query text.
    _SEARCH_URL = 'https://news.google.com/rss/search?q={}+when:1d'
    # Extra query parameters appended for each supported country code.
    _LOCALE_PARAMS = {
        'en': '&hl=en-NG&gl=NG&ceid=NG:en',
        'ko': '&hl=ko&gl=KR&ceid=KR:ko',
    }
    # Seconds to wait for Google before giving up, so a stalled connection
    # cannot block a scheduler worker forever.
    _REQUEST_TIMEOUT = 10
    _MODES = ('once', 'interval', 'cron')

    def __init__(self):
        """Start the background scheduler and open the database manager."""
        print ('srart crawling')
        self.scheduler = BackgroundScheduler(job_defaults={'max_instances': 10, 'coalesce': False})
        self.scheduler.start()
        self.dbManager = newsDB.newsDB()

    def __del__(self):
        self.stop()

    @classmethod
    def _build_url(cls, country, keyword):
        """Return the RSS search URL for ``keyword`` localized for ``country``.

        The keyword is percent-encoded so characters such as ``&`` or ``#``
        cannot corrupt the query string. ``+`` is left untouched so callers
        that already join terms with ``+`` keep working. Unknown country
        codes get no locale parameters.
        """
        from urllib.parse import quote_plus

        url = cls._SEARCH_URL.format(quote_plus(str(keyword), safe='+'))
        return url + cls._LOCALE_PARAMS.get(country, '')

    def exec(self, country, keyword):
        """Download the feed for ``keyword`` once and store every entry.

        Network failures and non-200 responses are reported on stdout and
        end the run. Entries missing the fields read below are skipped.
        """
        print ('Google News Cron Start: ' + datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
        url = self._build_url(country, keyword)

        try:
            res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print ('Error Requests: {}'.format(err))
            return

        if res.status_code != 200:
            print ('Google 검색 에러')
            return

        for entry in feedparser.parse(res.text).entries:
            try:
                entry['published'] = maya.parse(entry.published).datetime(to_timezone="Asia/Seoul", naive=True)
                entry['source'] = entry.source.title
            except (AttributeError, KeyError, ValueError) as err:
                # One malformed entry should not abort the whole run.
                # NOTE(review): assumes maya signals unparsable dates with a
                # ValueError subclass - confirm.
                print ('Skipping malformed entry: {}'.format(err))
                continue
            # NOTE(review): the target table is whichever one the DB manager
            # created most recently, not necessarily this job's keyword.
            self.dbManager.queryInsertGoogleNewsTable(entry)

    def run(self, mode, country, keyword):
        """Prepare the tables for ``keyword`` and schedule :meth:`exec`.

        ``mode`` is ``'once'`` (run a single time), ``'interval'`` (every 10
        seconds) or ``'cron'`` (cron trigger with ``second='*/10'``).

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        if mode not in self._MODES:
            # Fail loudly instead of creating tables and scheduling nothing.
            raise ValueError('unknown mode: {!r}'.format(mode))

        print ("실행!")
        self.dbManager.queryCreateGoogleNewsTable(keyword)
        self.dbManager.queryCreateKeywordTable()
        self.dbManager.queryInsertKeywordTable({
            'keyword': keyword,
            'country': country
        })

        job_args = [country, keyword]
        if mode == 'once':
            self.scheduler.add_job(self.exec, args=job_args)
        elif mode == 'interval':
            self.scheduler.add_job(self.exec, 'interval', seconds=10, args=job_args)
        else:
            self.scheduler.add_job(self.exec, 'cron', second='*/10', args=job_args)

    def stop(self):
        """Shut down the scheduler and close the database (best effort).

        Safe to call more than once and on a partially constructed instance,
        because it is also invoked from ``__del__``.
        """
        scheduler = getattr(self, 'scheduler', None)
        if scheduler is not None and scheduler.running:
            try:
                scheduler.shutdown()
            except Exception as err:
                # Teardown must continue to the database even if this fails.
                print ('Error stopping scheduler: {}'.format(err))

        db_manager = getattr(self, 'dbManager', None)
        if db_manager is not None:
            try:
                # newsDB exposes stop(); it has no close() method.
                db_manager.stop()
            except Exception as err:
                print ('Error closing database: {}'.format(err))
Binary file not shown.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import time | ||
import argparse | ||
|
||
import get_google_news | ||
|
||
def build_arg_parser():
    """Return the command-line parser for the crawler script.

    ``mode`` is an optional positional (``nargs='?'``) so that its default
    of ``'once'`` actually applies when it is omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', type=str, nargs='?', choices=['once','interval','cron'], default='once', help="Choose how you want to run the code")
    parser.add_argument('--country', type=str, required=False, default='en', choices=['en','ko'], help="Which country will you search for news?")
    parser.add_argument('--keyword', type=str, required=False, default='all', help="Enter keywords to crawl")
    return parser


def main():
    """Parse arguments, start the crawler and keep the process alive.

    The scheduler runs jobs in the background, so the main thread sleeps in
    a loop until interrupted. The crawler is stopped on any exit path, not
    only on Ctrl-C.
    """
    args = build_arg_parser().parse_args()
    crawler = None
    try:
        crawler = get_google_news.Crawler()
        crawler.run(args.mode, args.country, args.keyword)
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to end the script.
        pass
    finally:
        # crawler is still None if construction itself was interrupted.
        if crawler is not None:
            crawler.stop()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import sqlite3 | ||
|
||
class newsDB():
    """Small SQLite wrapper for crawled Google News entries and keywords.

    News entries live in one table per keyword (``googleNews_<keyword>``);
    the searched keywords are recorded in a separate ``keyword`` table.
    Values are always passed as bound parameters, and table names derived
    from keywords are quoted as identifiers.
    """

    def __init__(self, db_name='googleNews.db'):
        """Open (or create) the SQLite database at ``db_name``.

        Args:
            db_name: database path; defaults to ``googleNews.db``. Pass
                ``':memory:'`` for a throwaway in-memory database.
        """
        print ("start DB Manager")
        self.DBName = db_name
        # check_same_thread=False allows use from threads other than the
        # one that opened the connection.
        self.db = sqlite3.connect(self.DBName, check_same_thread=False)
        self.db.row_factory = sqlite3.Row
        self.googleNews_table = 'googleNews'
        self.keyword_table = 'keyword'
        # NOTE(review): 'source' is the primary key, so INSERT OR IGNORE
        # keeps only one row per news source - confirm this is intended
        # ('link' looks like the natural unique key).
        self.googleNews_columns = {
            'published': 'text',
            'source': 'text PRIMARY KEY',
            'title': 'text',
            'link': 'text',
        }
        self.keyword_columns = {
            'keyword': 'text PRIMARY KEY',
            'country': 'text',
        }

    def __del__(self):
        self.stop()

    def stop(self):
        """Close the database connection; safe to call repeatedly."""
        db = getattr(self, 'db', None)
        if db is None:
            # __init__ failed before the connection was opened.
            return
        try:
            db.close()
        except sqlite3.Error as err:
            print ('Error closing database: {}'.format(err))

    def close(self):
        """Alias of :meth:`stop` for callers expecting a ``close`` method."""
        self.stop()

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier."""
        return '"{}"'.format(str(name).replace('"', '""'))

    @staticmethod
    def _news_table_name(keyword):
        """Return the (unquoted) news table name for ``keyword``."""
        return 'googleNews_' + keyword.lower()

    def _create_table(self, table, columns):
        """Create ``table`` with the given column definitions if missing."""
        column_info = ",".join(
            col_name + ' ' + col_type for col_name, col_type in columns.items()
        )
        query = "CREATE TABLE IF NOT EXISTS {} ({})".format(
            self._quote_identifier(table), column_info
        )
        # The connection context manager commits on success and rolls back
        # if the statement raises.
        with self.db:
            self.db.execute(query)

    def _insert_row(self, table, columns, values):
        """Insert one row, ignoring it if the primary key already exists.

        Each value is stored as ``str(values[column])`` and bound as a
        parameter, so quotes in the data are preserved.
        """
        col_names = list(columns)
        query = 'INSERT OR IGNORE INTO {} ({}) VALUES ({})'.format(
            self._quote_identifier(table),
            ','.join(col_names),
            ','.join('?' for _ in col_names),
        )
        params = [str(values[col_name]) for col_name in col_names]
        with self.db:
            self.db.execute(query, params)

    # Create the Google News table for the given keyword.
    def queryCreateGoogleNewsTable(self, keyword):
        """Create the news table for ``keyword`` and make it the current
        insert target."""
        self.googleNews_table = self._news_table_name(keyword)
        self._create_table(self.googleNews_table, self.googleNews_columns)

    # Insert a row into the current Google News table.
    def queryInsertGoogleNewsTable(self, values):
        """Insert one entry into the table selected by the most recent
        :meth:`queryCreateGoogleNewsTable` call.

        ``values`` must provide every key in ``googleNews_columns``.
        """
        self._insert_row(self.googleNews_table, self.googleNews_columns, values)

    # Drop the Google News table for the given keyword.
    def queryDeleteAllGoogleNewsTable(self, keyword):
        """Drop the news table for ``keyword`` if it exists."""
        query = "DROP TABLE IF EXISTS {}".format(
            self._quote_identifier(self._news_table_name(keyword))
        )
        with self.db:
            self.db.execute(query)

    # Fetch every row of the Google News table for the given keyword.
    def querySelectAllGoogleNewsTable(self, keyword):
        """Return all rows of the news table for ``keyword``."""
        query = "SELECT * FROM {}".format(
            self._quote_identifier(self._news_table_name(keyword))
        )
        return self.db.execute(query).fetchall()

    # Create the keyword table.
    def queryCreateKeywordTable(self):
        """Create the keyword table if it does not exist."""
        self._create_table(self.keyword_table, self.keyword_columns)

    # Insert a row into the keyword table.
    def queryInsertKeywordTable(self, values):
        """Insert one keyword record; ``values`` must provide every key in
        ``keyword_columns``."""
        self._insert_row(self.keyword_table, self.keyword_columns, values)

    # Delete a keyword from the keyword table.
    def queryDeleteKeywordTable(self, keyword):
        """Delete the row whose keyword equals ``keyword``."""
        query = "DELETE FROM {} WHERE KEYWORD=?".format(
            self._quote_identifier(self.keyword_table)
        )
        with self.db:
            self.db.execute(query, (keyword,))

    # Fetch every row of the keyword table.
    def querySelectAllKeywordTable(self):
        """Return all rows of the keyword table."""
        query = "SELECT * FROM {}".format(self._quote_identifier(self.keyword_table))
        return self.db.execute(query).fetchall()
Oops, something went wrong.