#!/usr/bin/env python3
"""
Simple TripAdvisor Restaurant Scraper - Clean Version
Works with requests and beautifulsoup - avoids complex typing issues
"""

import requests
import json
import time
import os
import uuid
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse, quote_plus
from typing import List, Dict, Optional, Any
from sqlalchemy import create_engine, text
from bs4 import BeautifulSoup

class RestaurantData:
    """Plain mutable record holding everything scraped for one restaurant.

    Only ``name`` is supplied at construction time; every other attribute
    starts at an empty/zero value and is filled in afterwards by whoever
    builds the record.
    """

    def __init__(self, name: str):
        # Identity
        self.name: str = name
        self.tripadvisor_url: str = ""

        # Free-text details
        self.description: str = ""
        self.cuisine_type: str = ""
        self.phone: str = ""
        self.address: str = ""
        self.price_range: str = ""

        # Opening hours start out unset (None)
        self.opening_time: Optional[Any] = None
        self.closing_time: Optional[Any] = None

        # Popularity figures
        self.rating: float = 0.0
        self.review_count: int = 0

        # Collections -- a fresh list per instance, never shared
        self.photos: List[str] = []
        self.menu_items: List[Dict[str, Any]] = []

class AthensRestaurantScraper:
    """Simple restaurant scraper for Athens"""
    
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
        })
        
        # Database
        self.DATABASE_URL = f"postgresql://postgres:{quote_plus('F@f@k0s!!')}@localhost:5432/bookbeach"
        self.engine = create_engine(self.DATABASE_URL)
        
        # Photos directory
        self.photo_dir = "restaurant_photos"
        os.makedirs(self.photo_dir, exist_ok=True)
    
    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Get page safely"""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            print(f"❌ Failed to get {url}: {e}")
            return None
    
    def extract_restaurant_urls(self, base_url: str, pages: int = 15) -> List[str]:
        """Get restaurant URLs from search pages"""
        urls = set()
        
        for page_num in range(pages):
            offset = page_num * 30
            if page_num == 0:
                page_url = base_url
            else:
                page_url = base_url.replace('-oa30-', f'-oa{offset}-')
            
            print(f"📄 Scraping page {page_num + 1}...")
            soup = self.get_page(page_url)
            if not soup:
                continue
            
            # Find all links
            links = soup.find_all('a', href=True)
            page_count = 0
            
            for link in links:
                href_attr = link.get('href')
                if not href_attr:
                    continue
                
                # Convert to string if needed
                href = str(href_attr)
                
                # Check if it's a restaurant link
                if '/Restaurant_Review' in href or ('-d' in href and 'Restaurant' in href):
                    if href.startswith('/'):
                        full_url = 'https://www.tripadvisor.com' + href
                    else:
                        full_url = href
                    
                    # Clean URL
                    if '?' in full_url:
                        full_url = full_url.split('?')[0]
                    
                    urls.add(full_url)
                    page_count += 1
            
            print(f"   Found {page_count} restaurants on page {page_num + 1}")
            time.sleep(2)
        
        url_list = list(urls)
        print(f"✅ Total restaurant URLs: {len(url_list)}")
        return url_list
    
    def extract_restaurant_info(self, url: str) -> Optional[RestaurantData]:
        """Extract restaurant information"""
        soup = self.get_page(url)
        if not soup:
            return None
        
        try:
            # Name
            name = "Unknown Restaurant"
            h1_tags = soup.find_all('h1')
            for h1 in h1_tags:
                text = h1.get_text(strip=True)
                if text and len(text) > 3:
                    name = text
                    break
            
            restaurant = RestaurantData(name)
            restaurant.tripadvisor_url = url
            
            # Description - look for paragraphs with substantial text
            paragraphs = soup.find_all('p')
            for p in paragraphs:
                text = p.get_text(strip=True)
                if len(text) > 50 and len(text) < 500:
                    restaurant.description = text
                    break
            
            # Address - look for text that looks like addresses
            all_text = soup.get_text()
            address_patterns = [
                r'([A-Za-z\s,]+\d+[A-Za-z\s,]*Athens[A-Za-z\s,]*\d{3,5}[A-Za-z\s,]*Greece)',
                r'([\w\s,]+Athens[\w\s,]*\d{3,5})',
                r'([A-Za-z\s]+\d+[A-Za-z\s,]*Athens)'
            ]
            
            for pattern in address_patterns:
                match = re.search(pattern, all_text, re.IGNORECASE)
                if match:
                    restaurant.address = match.group(1).strip()
                    break
            
            # Cuisine type - look for food-related keywords
            food_keywords = ['greek', 'italian', 'mediterranean', 'seafood', 'traditional', 'modern', 'european', 'international']
            text_lower = all_text.lower()
            for keyword in food_keywords:
                if keyword in text_lower:
                    restaurant.cuisine_type = keyword.capitalize()
                    break
            
            # Rating - look for rating patterns
            rating_text = re.findall(r'(\d\.\d)\s*out\s*of\s*5', all_text, re.IGNORECASE)
            if rating_text:
                try:
                    restaurant.rating = float(rating_text[0])
                except:
                    pass
            
            # Review count
            review_matches = re.findall(r'(\d+)\s*review', all_text, re.IGNORECASE)
            if review_matches:
                try:
                    restaurant.review_count = int(review_matches[0])
                except:
                    pass
            
            # Photos - get image URLs
            images = soup.find_all('img')
            for img in images[:8]:  # Limit to 8 photos
                src_attr = img.get('src')
                if src_attr:
                    src = str(src_attr)
                    if any(keyword in src.lower() for keyword in ['restaurant', 'food', 'photo', 'image']):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = 'https://www.tripadvisor.com' + src
                        restaurant.photos.append(src)
            
            # Menu items with prices
            price_matches = re.findall(r'([€$£¥]\s*\d+(?:[.,]\d{2})?)', all_text)
            for i, price_match in enumerate(price_matches[:15]):  # Limit menu items
                try:
                    price_str = re.sub(r'[€$£¥]', '', price_match).strip()
                    price = float(price_str.replace(',', '.'))
                    
                    menu_item = {
                        'name': f'Menu Item {i+1}',
                        'price': price,
                        'description': '',
                        'category': restaurant.cuisine_type or 'General'
                    }
                    restaurant.menu_items.append(menu_item)
                except:
                    continue
            
            if restaurant.name != "Unknown Restaurant":
                print(f"✅ {restaurant.name} - {restaurant.rating}⭐ ({len(restaurant.photos)} photos, {len(restaurant.menu_items)} items)")
                return restaurant
            
        except Exception as e:
            print(f"❌ Error parsing {url}: {e}")
        
        return None
    
    def download_photo(self, url: str, restaurant_name: str, index: int) -> Optional[str]:
        """Download photo"""
        try:
            # Safe filename
            safe_name = re.sub(r'[^\w\s-]', '', restaurant_name)
            safe_name = re.sub(r'[-\s]+', '-', safe_name)[:30]
            
            filename = f"{safe_name}_{index}.jpg"
            filepath = os.path.join(self.photo_dir, filename)
            
            if os.path.exists(filepath):
                return filepath
            
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            
            print(f"📷 Downloaded: {filename}")
            return filepath
            
        except Exception as e:
            print(f"❌ Photo download failed: {e}")
            return None
    
    def save_restaurants(self, restaurants: List[RestaurantData]):
        """Save to database"""
        with self.engine.connect() as db:
            # Get company ID
            company_id = "c35388d2-0028-4002-bcc4-db4d7ed2042e"
            
            # Get Athens beach place
            beach_result = db.execute(text("SELECT beach_place_id FROM beach_places WHERE city ILIKE '%athens%' LIMIT 1")).fetchone()
            beach_place_id = beach_result[0] if beach_result else None
            
            # Get EUR currency
            currency_id = 1  # EUR
            
            saved = 0
            for restaurant in restaurants:
                try:
                    # Check if exists
                    existing = db.execute(text("SELECT 1 FROM restaurants WHERE restaurant_name = :name"), 
                                        {'name': restaurant.name}).fetchone()
                    if existing:
                        continue
                    
                    restaurant_id = str(uuid.uuid4())
                    
                    # Insert restaurant
                    db.execute(text("""
                        INSERT INTO restaurants (
                            restaurant_id, restaurant_name, company_id, beach_place_id,
                            description, cuisine_type, is_active, created_at
                        ) VALUES (
                            :restaurant_id, :name, :company_id, :beach_place_id,
                            :description, :cuisine_type, true, NOW()
                        )
                    """), {
                        'restaurant_id': restaurant_id,
                        'name': restaurant.name,
                        'company_id': company_id,
                        'beach_place_id': beach_place_id,
                        'description': restaurant.description,
                        'cuisine_type': restaurant.cuisine_type or 'Mediterranean'
                    })
                    
                    # Create category
                    category_id = int(time.time() * 1000) % 2000000000
                    try:
                        db.execute(text("""
                            INSERT INTO restaurant_categories (
                                category_id, restaurant_id, category_name
                            ) VALUES (:category_id, :restaurant_id, :category_name)
                        """), {
                            'category_id': category_id,
                            'restaurant_id': restaurant_id,
                            'category_name': restaurant.cuisine_type or 'General'
                        })
                    except:
                        category_id = 1
                    
                    # Insert menu items
                    for item in restaurant.menu_items:
                        try:
                            item_id = str(uuid.uuid4())
                            db.execute(text("""
                                INSERT INTO restaurant_items (
                                    item_id, restaurant_id, category_id, item_name,
                                    price, currency_id, is_available, created_at
                                ) VALUES (
                                    :item_id, :restaurant_id, :category_id, :item_name,
                                    :price, :currency_id, true, NOW()
                                )
                            """), {
                                'item_id': item_id,
                                'restaurant_id': restaurant_id,
                                'category_id': category_id,
                                'item_name': item['name'],
                                'price': item['price'],
                                'currency_id': currency_id
                            })
                        except:
                            pass
                    
                    # Download photos
                    for i, photo_url in enumerate(restaurant.photos):
                        self.download_photo(photo_url, restaurant.name, i)
                    
                    db.commit()
                    saved += 1
                    print(f"💾 Saved: {restaurant.name}")
                    
                except Exception as e:
                    db.rollback()
                    print(f"❌ Save failed for {restaurant.name}: {e}")
            
            print(f"✅ Saved {saved}/{len(restaurants)} restaurants to database")
    
    def scrape_athens_restaurants(self, target: int = 500):
        """Main scraping method"""
        print(f"🚀 Scraping Athens restaurants (target: {target})")
        
        base_url = "https://www.tripadvisor.com/Restaurants-g189400-oa30-zfp10954-Athens_Attica.html"
        
        # Get URLs
        urls = self.extract_restaurant_urls(base_url, pages=20)
        
        # Process restaurants
        restaurants = []
        for i, url in enumerate(urls[:target + 50], 1):
            print(f"📊 Processing {i}: {len(restaurants)} valid restaurants")
            
            restaurant = self.extract_restaurant_info(url)
            if restaurant:
                restaurants.append(restaurant)
            
            # Save in batches
            if len(restaurants) % 20 == 0 and restaurants:
                self.save_restaurants(restaurants[-20:])
            
            if len(restaurants) >= target:
                break
            
            time.sleep(1)
        
        # Save remaining
        remaining = len(restaurants) % 20
        if remaining > 0:
            self.save_restaurants(restaurants[-remaining:])
        
        print(f"🎉 Completed! Scraped {len(restaurants)} restaurants")
        return restaurants

def main():
    """Entry point: scrape up to 500 restaurants and print a short summary.

    Any exception raised while scraping or reporting is printed together
    with its traceback instead of propagating.
    """
    scraper = AthensRestaurantScraper()

    try:
        restaurants = scraper.scrape_athens_restaurants(target=500)

        summary_lines = (
            "\n✅ FINAL RESULTS:",
            f"   🏪 Restaurants: {len(restaurants)}",
            f"   📷 Photos in: {scraper.photo_dir}",
            "   💾 Data saved to database",
        )
        for line in summary_lines:
            print(line)

    except Exception as err:
        print(f"❌ Error: {err}")
        import traceback
        traceback.print_exc()

# Run the scraper only when executed as a script, not when imported
if __name__ == "__main__":
    main()