# news_scraper.py
# Originally published as todlong2 / news_scraper.py (author: cwpkd,
# commit ceb72a8, verified) — header converted to comments so the file parses.
import time
from typing import Dict, List
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

from config import HEADERS, MAX_NEWS_ITEMS, YAHOO_FINANCE_NEWS_URL
class YahooFinanceNewsScraper:
    """Scrape headline data from Yahoo Finance news/search pages."""

    def __init__(self):
        # Base listing URL and request headers come from project config.
        self.base_url = YAHOO_FINANCE_NEWS_URL
        self.headers = HEADERS

    def scrape_news(self, query: str = "", max_items: int = MAX_NEWS_ITEMS) -> List[Dict]:
        """
        Fetch news items from Yahoo Finance.

        Args:
            query: Search term; when empty, the configured front-page
                feed URL is scraped instead of the search endpoint.
            max_items: Maximum number of items to return.

        Returns:
            List of dicts with keys 'title', 'description', 'link',
            'source'. On any scraping error, canned sample items are
            returned so callers always get a non-empty, well-formed list.
        """
        try:
            # Build the target URL. quote_plus() percent-encodes the full
            # query; the original only replaced spaces with '+', so
            # characters such as '&' or '#' would corrupt the URL.
            if query:
                search_url = f"https://finance.yahoo.com/search?q={quote_plus(query)}"
            else:
                search_url = self.base_url

            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            news_items = []

            # Yahoo Finance markup varies over time; try several known
            # selectors in order of specificity, falling back to bare <h3>.
            article_containers = (
                soup.find_all('div', class_='Ov(h)') or
                soup.find_all('div', class_='js-stream-content') or
                soup.find_all('li', class_='js-stream-content') or
                soup.find_all('h3')
            )

            for item in article_containers[:max_items]:
                try:
                    # Headline element: prefer the anchor, else a nested <h3>.
                    title_elem = item.find('a') or item.find('h3')
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    link = title_elem.get('href', '')

                    # Resolve site-relative links to absolute URLs.
                    if link and not link.startswith('http'):
                        link = f"https://finance.yahoo.com{link}"

                    # Optional summary paragraph.
                    desc_elem = item.find('p')
                    description = desc_elem.get_text(strip=True) if desc_elem else ""

                    # Very short strings tend to be nav labels, not headlines.
                    if title and len(title) > 10:
                        news_items.append({
                            'title': title,
                            'description': description,
                            'link': link,
                            'source': 'Yahoo Finance'
                        })
                except Exception:
                    # Best-effort per container: skip malformed markup
                    # and keep scanning the rest.
                    continue

            # None of the selectors matched — scan every anchor instead.
            if not news_items:
                news_items = self._fallback_scrape(soup, max_items)

            return news_items[:max_items]

        except Exception as e:
            print(f"Error scraping news: {str(e)}")
            return self._get_sample_news()

    def _fallback_scrape(self, soup, max_items: int) -> List[Dict]:
        """Fallback strategy: collect any anchor that looks like a headline."""
        news_items = []
        for anchor in soup.find_all('a'):
            text = anchor.get_text(strip=True)
            href = anchor.get('href', '')
            # Heuristic: long link text pointing at a news/article path.
            if len(text) > 20 and ('news' in href or 'article' in href):
                if not href.startswith('http'):
                    href = f"https://finance.yahoo.com{href}"
                news_items.append({
                    'title': text,
                    'description': '',
                    'link': href,
                    'source': 'Yahoo Finance'
                })
                if len(news_items) >= max_items:
                    break
        return news_items

    def _get_sample_news(self) -> List[Dict]:
        """Canned sample items returned when scraping fails entirely."""
        return [
            {
                'title': 'Stock Market Rallies on Strong Economic Data',
                'description': 'Major indices posted significant gains as investors reacted positively to economic indicators.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Tech Stocks Lead Market Higher Amid AI Boom',
                'description': 'Technology sector outperforms as artificial intelligence investments surge.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Federal Reserve Holds Interest Rates Steady',
                'description': 'Central bank maintains current policy stance citing inflation concerns.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            }
        ]
def test_scraper():
    """Quick manual check: run a sample query and print what the scraper found."""
    articles = YahooFinanceNewsScraper().scrape_news(query="technology", max_items=5)
    print(f"Found {len(articles)} news items:")
    for idx, entry in enumerate(articles, 1):
        print(f"\n{idx}. {entry['title']}")
        print(f" Link: {entry['link']}")


if __name__ == "__main__":
    test_scraper()