Skip to content

Commit

Permalink
fix: ENV
Browse files Browse the repository at this point in the history
  • Loading branch information
invalidid56 committed Nov 10, 2023
1 parent c0b800b commit cb2cb05
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 0 deletions.
61 changes: 61 additions & 0 deletions crawler/crawler/get_google_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.base import JobLookupError
import requests
import datetime
import maya
import feedparser

import newsDB


class Crawler():
    """Fetch Google News RSS search results on a schedule and store them.

    A ``BackgroundScheduler`` is started on construction and a ``newsDB``
    manager is opened; ``run`` registers a crawl job and ``stop`` releases
    both resources.

    NOTE: ``newsDB`` remembers only the most recently created news table, so
    calling ``run`` with several keywords on one instance routes every insert
    to the table of the last keyword.
    """

    # ``when:1d`` restricts the search to articles from the last day.
    _SEARCH_URL = 'https://news.google.com/rss/search?q={}+when:1d'
    # Locale query-string suffix appended for each supported country code.
    _LOCALE_SUFFIX = {
        'en': '&hl=en-NG&gl=NG&ceid=NG:en',
        'ko': '&hl=ko&gl=KR&ceid=KR:ko',
    }
    # Seconds before an HTTP request is abandoned; without it a stalled
    # connection would block the job forever while new jobs keep piling up.
    _REQUEST_TIMEOUT = 10
    _MODES = ('once', 'interval', 'cron')

    def __init__(self):
        print('srart crawling')
        self.scheduler = BackgroundScheduler(job_defaults={'max_instances': 10, 'coalesce': False})
        self.scheduler.start()
        self.dbManager = newsDB.newsDB()

    def __del__(self):
        self.stop()

    @classmethod
    def _build_url(cls, country, keyword):
        """Return the RSS search URL for *keyword* localized for *country*."""
        from urllib.parse import quote_plus

        # Encode the keyword so characters such as '&', '#' or spaces cannot
        # alter the structure of the query string.
        url = cls._SEARCH_URL.format(quote_plus(str(keyword)))
        # Unknown country codes fall back to Google's default locale.
        return url + cls._LOCALE_SUFFIX.get(country, '')

    def exec(self, country, keyword):
        """Run one crawl: download the feed and insert every entry.

        Args:
            country: Locale code, ``'en'`` or ``'ko'``; other values use no
                locale parameters.
            keyword: Search term sent to Google News.

        Request failures and non-200 responses are reported on stdout and end
        the crawl; a malformed entry is reported and skipped so the remaining
        entries are still stored.
        """
        print('Google News Cron Start: ' + datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
        url = self._build_url(country, keyword)

        try:
            res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print('Error Requests: {}'.format(err))
            return

        if res.status_code != 200:
            print('Google 검색 에러')
            return

        for data in feedparser.parse(res.text).entries:
            try:
                # Normalize the publication time to naive Asia/Seoul time and
                # flatten the source to its title before storing.
                data['published'] = maya.parse(data.published).datetime(to_timezone="Asia/Seoul", naive=True)
                data['source'] = data.source.title
            except (AttributeError, KeyError, ValueError) as err:
                # An entry without a source or with an unparseable date must
                # not abort the rest of the feed.
                print('Skip malformed entry: {}'.format(err))
                continue
            self.dbManager.queryInsertGoogleNewsTable(data)

    def run(self, mode, country, keyword):
        """Prepare the tables for *keyword* and schedule the crawl job.

        Args:
            mode: ``'once'`` (run a single time), ``'interval'`` (every 10
                seconds) or ``'cron'`` (on every 10th second of the minute).
            country: Locale code passed through to ``exec``.
            keyword: Search term passed through to ``exec``.

        Raises:
            ValueError: If *mode* is not one of the supported values.
        """
        # Validate first so an unsupported mode does not leave tables and a
        # keyword row behind with no job scheduled.
        if mode not in self._MODES:
            raise ValueError('unsupported mode: {!r}'.format(mode))

        print("실행!")
        self.dbManager.queryCreateGoogleNewsTable(keyword)
        self.dbManager.queryCreateKeywordTable()
        self.dbManager.queryInsertKeywordTable({
            'keyword': keyword,
            'country': country
        })

        job_args = [country, keyword]
        if mode == 'once':
            self.scheduler.add_job(self.exec, args=job_args)
        elif mode == 'interval':
            self.scheduler.add_job(self.exec, 'interval', seconds=10, args=job_args)
        else:
            self.scheduler.add_job(self.exec, 'cron', second='*/10', args=job_args)

    def stop(self):
        """Shut down the scheduler and close the database (best effort).

        Safe to call repeatedly and on a partially constructed instance, which
        matters because ``__del__`` also calls it.
        """
        scheduler = getattr(self, 'scheduler', None)
        if scheduler is not None:
            try:
                # shutdown() raises when the scheduler is not running.
                if scheduler.running:
                    scheduler.shutdown()
            except Exception as err:
                print('Error stopping scheduler: {}'.format(err))

        db_manager = getattr(self, 'dbManager', None)
        if db_manager is not None:
            try:
                # newsDB exposes stop(); the close() call used previously
                # does not exist there, so the database was never closed.
                db_manager.stop()
            except Exception as err:
                print('Error closing database: {}'.format(err))
Binary file added crawler/crawler/googleNews.db
Binary file not shown.
Empty file added crawler/crawler/google_news
Empty file.
Empty file added crawler/crawler/google_news.db
Empty file.
21 changes: 21 additions & 0 deletions crawler/crawler/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import time
import argparse

import get_google_news

def main():
    """Parse the command line, start the crawler and block until interrupted.

    The crawl jobs run on the crawler's background scheduler, so this function
    only keeps the main thread alive. Ctrl+C ends the loop, and the crawler is
    stopped on every exit path.
    """
    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional; without it argparse always
    # requires a value and the declared default of 'once' can never apply.
    parser.add_argument('mode', type=str, nargs='?', choices=['once','interval','cron'], default='once', help="Choose how you want to run the code")
    parser.add_argument('--country', type=str, required=False, default='en', choices=['en','ko'], help="Which country will you search for news?")
    parser.add_argument('--keyword', type=str, required=False, default='all', help="Enter keywords to crawl")
    args = parser.parse_args()

    # Construct outside the try block so cleanup never touches an unbound name.
    crawler = get_google_news.Crawler()
    try:
        crawler.run(args.mode, args.country, args.keyword)
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to end the program.
        pass
    finally:
        crawler.stop()


if __name__ == "__main__":
    main()
84 changes: 84 additions & 0 deletions crawler/crawler/newsDB.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import sqlite3

class newsDB():
    """SQLite storage for crawled Google News entries and crawl keywords.

    News entries live in one table per keyword, named
    ``googleNews_<keyword lower-cased>``; keywords live in a single
    ``keyword`` table. All values are bound as SQL parameters and table names
    are quoted, so arbitrary text in keywords or article fields is stored
    verbatim and cannot change the statement being executed.
    """

    def __init__(self, db_name='googleNews.db'):
        """Open (or create) the database file.

        Args:
            db_name: Path of the SQLite database; ``':memory:'`` gives a
                throwaway in-memory database.
        """
        print("start DB Manager")
        self.DBName = db_name
        # check_same_thread=False lets other threads use this connection.
        self.db = sqlite3.connect(self.DBName, check_same_thread=False)
        self.db.row_factory = sqlite3.Row
        self.googleNews_table = 'googleNews'
        self.keyword_table = 'keyword'
        self.googleNews_columns = {
            'published': 'text',
            'source': 'text',
            'title': 'text',
            # The link identifies an article. Keying on the source name made
            # INSERT OR IGNORE keep only one article per publisher. Tables
            # created earlier keep their old schema (CREATE IF NOT EXISTS).
            'link': 'text PRIMARY KEY',
        }
        self.keyword_columns = {
            'keyword': 'text PRIMARY KEY',
            'country': 'text',
        }

    def __del__(self):
        self.stop()

    def stop(self):
        """Close the connection; safe to call more than once."""
        # The attribute is missing when __init__ failed before connecting.
        db = getattr(self, 'db', None)
        if db is None:
            return
        try:
            db.close()
        except sqlite3.Error:
            # Best-effort cleanup: a failed close must not mask shutdown.
            pass

    def close(self):
        """Alias of ``stop`` for callers that use the conventional name."""
        self.stop()

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a double-quoted SQL identifier."""
        return '"{}"'.format(str(name).replace('"', '""'))

    @staticmethod
    def _news_table_name(keyword):
        """Return the unquoted news table name used for *keyword*."""
        return 'googleNews_' + keyword.lower()

    def _create_table(self, table, columns):
        """Create *table* with the given column definitions if it is missing."""
        column_info = ",".join(col_name + ' ' + col_type for col_name, col_type in columns.items())
        query = "CREATE TABLE IF NOT EXISTS {} ({})".format(self._quote_identifier(table), column_info)
        # The connection context manager commits, or rolls back on error.
        with self.db:
            self.db.execute(query)

    def _insert_row(self, table, columns, values):
        """Insert one row, silently ignoring primary-key duplicates."""
        column_names = list(columns)
        query = 'INSERT OR IGNORE INTO {} ({}) VALUES ({})'.format(
            self._quote_identifier(table),
            ','.join(column_names),
            ','.join('?' for _ in column_names),
        )
        # Every column is declared as text, so store the string form.
        params = [str(values[col_name]) for col_name in column_names]
        with self.db:
            self.db.execute(query, params)

    def queryCreateGoogleNewsTable(self, keyword):
        """Create the news table for *keyword* and make it the insert target."""
        self.googleNews_table = self._news_table_name(keyword)
        self._create_table(self.googleNews_table, self.googleNews_columns)

    def queryInsertGoogleNewsTable(self, values):
        """Insert a news entry into the current news table.

        Args:
            values: Mapping providing ``published``, ``source``, ``title`` and
                ``link``.

        Raises:
            KeyError: If *values* lacks one of the columns.
        """
        self._insert_row(self.googleNews_table, self.googleNews_columns, values)

    def queryDeleteAllGoogleNewsTable(self, keyword):
        """Drop the news table that belongs to *keyword*, if it exists."""
        table = self._quote_identifier(self._news_table_name(keyword))
        with self.db:
            self.db.execute("DROP TABLE IF EXISTS {}".format(table))

    def querySelectAllGoogleNewsTable(self, keyword):
        """Return every row of the news table that belongs to *keyword*."""
        table = self._quote_identifier(self._news_table_name(keyword))
        return self.db.execute("SELECT * FROM {}".format(table)).fetchall()

    def queryCreateKeywordTable(self):
        """Create the keyword table if it is missing."""
        self._create_table(self.keyword_table, self.keyword_columns)

    def queryInsertKeywordTable(self, values):
        """Insert a keyword row.

        Args:
            values: Mapping providing ``keyword`` and ``country``.

        Raises:
            KeyError: If *values* lacks one of the columns.
        """
        self._insert_row(self.keyword_table, self.keyword_columns, values)

    def queryDeleteKeywordTable(self, keyword):
        """Delete the row for *keyword* from the keyword table."""
        query = "DELETE FROM {} WHERE keyword = ?".format(self._quote_identifier(self.keyword_table))
        with self.db:
            self.db.execute(query, (keyword,))

    def querySelectAllKeywordTable(self):
        """Return every row of the keyword table."""
        query = "SELECT * FROM {}".format(self._quote_identifier(self.keyword_table))
        return self.db.execute(query).fetchall()
Loading

0 comments on commit cb2cb05

Please sign in to comment.