import time
from typing import Dict, List, Optional
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup

from config import HEADERS, MAX_NEWS_ITEMS, YAHOO_FINANCE_NEWS_URL


class YahooFinanceNewsScraper:
    """Scrape news headlines from Yahoo Finance HTML pages.

    Each news item is a dict with the keys ``title``, ``description``,
    ``link`` and ``source``.
    """

    # Root used for the search endpoint and for resolving relative links.
    SITE_ROOT = "https://finance.yahoo.com"

    # A title must be strictly longer than this to be kept; shorter anchor
    # texts are treated as invalid titles.
    MIN_TITLE_LENGTH = 10

    # Minimum anchor-text length (exclusive) used by the fallback scan.
    MIN_FALLBACK_TEXT_LENGTH = 20

    def __init__(self):
        self.base_url = YAHOO_FINANCE_NEWS_URL
        self.headers = HEADERS

    def scrape_news(self, query: str = "", max_items: int = MAX_NEWS_ITEMS) -> List[Dict]:
        """Fetch news from Yahoo Finance.

        Args:
            query: Search term. When empty, the configured news page
                (``self.base_url``) is fetched instead of the search page.
            max_items: Maximum number of news items to return.

        Returns:
            A list of news dicts. If anything goes wrong (request failure,
            HTTP error status, parsing error) the error is printed and the
            built-in sample news list is returned instead; its ``source``
            field is marked ``(Sample)``.
        """
        try:
            if query:
                # quote_plus encodes spaces as '+' and also escapes characters
                # such as '&', '#' and '%' that would otherwise corrupt the URL.
                search_url = f"{self.SITE_ROOT}/search?q={quote_plus(query)}"
            else:
                search_url = self.base_url

            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Try several selectors in order and use the first one that
            # matches anything.
            article_containers = (
                soup.find_all('div', class_='Ov(h)')
                or soup.find_all('div', class_='js-stream-content')
                or soup.find_all('li', class_='js-stream-content')
                or soup.find_all('h3')
            )

            news_items = []
            for item in article_containers[:max_items]:
                try:
                    entry = self._parse_item(item)
                except Exception:
                    # Deliberate best-effort: one malformed container must
                    # not abort the whole scrape, so it is skipped.
                    continue
                if entry is not None:
                    news_items.append(entry)

            # Nothing usable from the structured selectors: scan all links.
            if not news_items:
                news_items = self._fallback_scrape(soup, max_items)

            return news_items[:max_items]

        except Exception as e:
            print(f"Error scraping news: {e}")
            return self._get_sample_news()

    def _absolute_url(self, href: str) -> str:
        """Resolve *href* against the site root; an empty href stays empty."""
        if not href:
            return href
        # urljoin leaves absolute URLs untouched and correctly handles
        # protocol-relative ("//host/path") and slash-less relative paths.
        return urljoin(f"{self.SITE_ROOT}/", href)

    def _parse_item(self, item) -> Optional[Dict]:
        """Build a news dict from one container tag, or None if unusable."""
        title_elem = item.find('a') or item.find('h3')
        if not title_elem:
            return None

        title = title_elem.get_text(strip=True)
        if not title or len(title) <= self.MIN_TITLE_LENGTH:
            return None

        desc_elem = item.find('p')
        description = desc_elem.get_text(strip=True) if desc_elem else ""

        return {
            'title': title,
            'description': description,
            'link': self._absolute_url(title_elem.get('href', '')),
            'source': 'Yahoo Finance',
        }

    def _fallback_scrape(self, soup, max_items: int) -> List[Dict]:
        """Fallback: collect headline-like links from the whole page.

        A link is kept when its text is longer than 20 characters and its
        href contains ``news`` or ``article``. At most *max_items* entries
        are returned; descriptions are left empty.
        """
        news_items = []

        for anchor in soup.find_all('a'):
            # Check the limit before appending so max_items <= 0 yields [].
            if len(news_items) >= max_items:
                break

            text = anchor.get_text(strip=True)
            href = anchor.get('href', '')

            if len(text) <= self.MIN_FALLBACK_TEXT_LENGTH:
                continue
            if 'news' not in href and 'article' not in href:
                continue

            news_items.append({
                'title': text,
                'description': '',
                'link': self._absolute_url(href),
                'source': 'Yahoo Finance',
            })

        return news_items

    def _get_sample_news(self) -> List[Dict]:
        """Return fixed sample news used when scraping fails."""
        return [
            {
                'title': 'Stock Market Rallies on Strong Economic Data',
                'description': 'Major indices posted significant gains as investors reacted positively to economic indicators.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Tech Stocks Lead Market Higher Amid AI Boom',
                'description': 'Technology sector outperforms as artificial intelligence investments surge.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Federal Reserve Holds Interest Rates Steady',
                'description': 'Central bank maintains current policy stance citing inflation concerns.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            }
        ]


def test_scraper():
    """Run the scraper once with a sample query and print the results."""
    scraper = YahooFinanceNewsScraper()
    news = scraper.scrape_news(query="technology", max_items=5)

    print(f"Found {len(news)} news items:")
    for i, item in enumerate(news, 1):
        print(f"\n{i}. {item['title']}")
        print(f" Link: {item['link']}")


if __name__ == "__main__":
    test_scraper()