import time
from typing import Dict, List
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

from config import HEADERS, MAX_NEWS_ITEMS, YAHOO_FINANCE_NEWS_URL
| |
|
class YahooFinanceNewsScraper:
    """Scrape news headlines from Yahoo Finance, with layered fallbacks.

    Scraping strategy (best effort, in order):
      1. Known article-container CSS classes on the search / front page.
      2. A generic "any link that looks like a news URL" sweep.
      3. Hard-coded sample items, so callers always get a non-empty list.
    """

    def __init__(self):
        # Front-page URL and request headers come from project config.
        self.base_url = YAHOO_FINANCE_NEWS_URL
        self.headers = HEADERS

    def scrape_news(self, query: str = "", max_items: int = MAX_NEWS_ITEMS) -> List[Dict]:
        """
        Fetch news from Yahoo Finance.

        Args:
            query: Search term; when empty, the front page is scraped instead.
            max_items: Maximum number of news items to return.

        Returns:
            List of dicts with keys 'title', 'description', 'link', 'source'.
            Never raises: on any failure, sample items are returned instead.
        """
        try:
            if query:
                # quote_plus percent-encodes '&', '#', '=', unicode, etc.
                # The previous replace(' ', '+') produced broken URLs for
                # any query containing URL-special characters.
                search_url = f"https://finance.yahoo.com/search?q={quote_plus(query)}"
            else:
                search_url = self.base_url

            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            news_items = []

            # Yahoo's markup changes frequently; try several known
            # container shapes, most specific first.
            article_containers = (
                soup.find_all('div', class_='Ov(h)') or
                soup.find_all('div', class_='js-stream-content') or
                soup.find_all('li', class_='js-stream-content') or
                soup.find_all('h3')
            )

            for item in article_containers[:max_items]:
                try:
                    title_elem = item.find('a') or item.find('h3')
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    link = title_elem.get('href', '')

                    # Site-relative hrefs need the host prefix.
                    if link and not link.startswith('http'):
                        link = f"https://finance.yahoo.com{link}"

                    desc_elem = item.find('p')
                    description = desc_elem.get_text(strip=True) if desc_elem else ""

                    # len > 10 filters out nav fragments ("More", "Sign in", ...).
                    if title and len(title) > 10:
                        news_items.append({
                            'title': title,
                            'description': description,
                            'link': link,
                            'source': 'Yahoo Finance'
                        })

                except Exception:
                    # Best effort per item: one malformed node must not
                    # abort the whole scrape.
                    continue

            # Nothing matched the known containers — sweep all links.
            if not news_items:
                news_items = self._fallback_scrape(soup, max_items)

            return news_items[:max_items]

        except Exception as e:
            # Network / parsing failure: degrade to sample data so callers
            # always receive something renderable.
            print(f"Error scraping news: {str(e)}")
            return self._get_sample_news()

    def _fallback_scrape(self, soup, max_items: int) -> List[Dict]:
        """Fallback: harvest any link whose href looks like a news article."""
        news_items = []
        seen_titles = set()  # the same headline often appears in several page sections

        all_links = soup.find_all('a')

        for link in all_links:
            text = link.get_text(strip=True)
            href = link.get('href', '')

            # Heuristic: real headlines are longish and point at news URLs.
            if len(text) > 20 and ('news' in href or 'article' in href):
                if text in seen_titles:
                    continue
                seen_titles.add(text)

                if not href.startswith('http'):
                    href = f"https://finance.yahoo.com{href}"

                news_items.append({
                    'title': text,
                    'description': '',
                    'link': href,
                    'source': 'Yahoo Finance'
                })

                if len(news_items) >= max_items:
                    break

        return news_items

    def _get_sample_news(self) -> List[Dict]:
        """Sample items returned when scraping fails entirely."""
        return [
            {
                'title': 'Stock Market Rallies on Strong Economic Data',
                'description': 'Major indices posted significant gains as investors reacted positively to economic indicators.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Tech Stocks Lead Market Higher Amid AI Boom',
                'description': 'Technology sector outperforms as artificial intelligence investments surge.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Federal Reserve Holds Interest Rates Steady',
                'description': 'Central bank maintains current policy stance citing inflation concerns.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            }
        ]
| |
|
def test_scraper():
    """Manual smoke test: fetch a few technology headlines and print them."""
    found = YahooFinanceNewsScraper().scrape_news(query="technology", max_items=5)

    print(f"Found {len(found)} news items:")
    for idx, article in enumerate(found, start=1):
        print(f"\n{idx}. {article['title']}")
        print(f" Link: {article['link']}")
| |
|
# Run the manual smoke test only when executed directly, not on import.
if __name__ == "__main__":
    test_scraper()