# 구글 기사 크롤링
import requests
from bs4 import BeautifulSoup as bs
import datetime
from datetime import timedelta
import pandas as pd
import openpyxl
# Crawl Google News results for each keyword, restricted to the window from
# yesterday to today, and save the collected articles to an Excel workbook.
today = datetime.date.today()
yesterday = today - timedelta(days=1)

# Date bounds for the `tbs` query parameter, written as
# month / zero-padded day / year (e.g. 6/05/2023).
cd_max = f'{today.month}/{today:%d}/{today.year}'
cd_min = f'{yesterday.month}/{yesterday:%d}/{yesterday.year}'
tbs = f'cdr:1,cd_min:{cd_min},cd_max:{cd_max}'

keyword = ['노르웨이수산물위원회', '노르웨이 연어', '노르웨이 고등어', '관세']

# Request settings are identical for every query, so build them once.
search_url = 'https://www.google.com/search?'
header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
cookie = {'CONSENT': 'YES'}

data = []
for word in keyword:
    # `start` steps through 0, 10, 20 -- presumably the result offset of the
    # first three result pages (assumes 10 results per page; TODO confirm).
    for start in range(0, 30, 10):
        params = {'q': word, 'hl': 'ko', 'tbm': 'nws', 'tbs': tbs, 'start': start}
        try:
            # A timeout keeps a stalled connection from hanging the script.
            res = requests.get(
                search_url,
                params=params,
                headers=header,
                cookies=cookie,
                timeout=10,
            )
            res.raise_for_status()
        except requests.RequestException as err:
            # Give up on this keyword's remaining pages but keep crawling the
            # other keywords instead of aborting the whole run.
            print(f'요청 실패 ({word}, start={start}): {err}')
            break

        soup = bs(res.text, 'html.parser')
        news_file = soup.find_all('div', 'SoaBEf')
        if not news_file:
            # No result cards on this page, so later pages are not requested.
            break

        for news in news_file:
            title_tag = news.find('div', {'class': 'mCBkyc ynAwRc MBeuO nDgy9d'})
            press_tag = news.find('div', {'class': 'CEMjEf NUnG9d'})
            link_tag = news.find('a', {'class': 'WlydOe'})
            # Skip only the incomplete card; the remaining cards on the page
            # are still collected.
            if title_tag is None or press_tag is None or link_tag is None:
                continue
            article_url = link_tag.get('href')
            if not article_url:
                continue
            data.append({
                'keyword': word,
                'press': press_tag.text,
                'title': title_tag.text,
                'url': article_url,
                'date': today.strftime('%b %d %Y'),
            })

# Explicit columns keep the sheet layout stable even when nothing was found.
df = pd.DataFrame(data, columns=['keyword', 'press', 'title', 'url', 'date'])

filename = today.strftime("%d %B %Y") + ' 구글 기사 크롤링.xlsx'
df.to_excel(filename)

# Reopen the workbook to widen columns. to_excel writes the DataFrame index
# into column A by default, so B/C/D hold keyword/press/title.
wb = openpyxl.load_workbook(filename)
ws = wb.active
ws.column_dimensions['B'].width = 15
ws.column_dimensions['C'].width = 22
ws.column_dimensions['D'].width = 68
wb.save(filename)
print('완료')
728x90
'Programming > Toy Project' 카테고리의 다른 글
Outlook 메일 전송 Python (0) | 2023.06.05 |
---|---|
엑셀 파일 요약 (0) | 2023.06.02 |
네이버 기사 키워드 크롤링 + 액셀 저장 (0) | 2023.02.05 |
댓글