# -*- coding: utf-8 -*-
"""
深圳商机大数据爬虫
- 每天推送一次全行业信息化/大数据/智能化项目商机
- 每小时检查室分工程信息，匹配关键字推送
- 监控深圳市2026年重大项目清单（790个），项目更新自动通知
- 配置可维护：所有抓取地址、关键字、频率都在配置文件修改

使用维护：
- 编辑配置：`vi /root/.openclaw/biz_config.py`
- 改完保存后重启爬虫：`pkill -f biz_crawler && nohup python3 biz_crawler.py > /root/business-opportunities-crawler.log 2>&1 &`
- 查看状态页面：http://118.196.144.205:8000/business-opportunities.html
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import random
from datetime import datetime, date
from flask import Flask, jsonify
import sys
import os

# Make the external config directory importable.
sys.path.append('/root/.openclaw')

# Pull every setting (CRAWL_SOURCES, BUSINESS_KEYWORDS,
# ROOM_DISTRIBUTION_KEYWORDS, PUSH_CONFIG, ...) from the config file via
# star-import; abort loudly if it is missing.
try:
    from biz_config import *
except ImportError:
    print("ERROR: 找不到配置文件 /root/.openclaw/biz_config.py")
    # BUG FIX: use sys.exit() instead of the bare exit() builtin — exit() is
    # injected by the site module and is not guaranteed to exist when the
    # script runs with `python -S` or as a frozen binary.
    sys.exit(1)

# Runtime statistics, exposed through the /api/crawler/status endpoint below.
stats = {
    'start_time': datetime.now().isoformat(),  # process start time (ISO 8601)
    'last_update': None,        # ISO timestamp of the most recent crawl pass
    'last_run_success': None,   # ISO timestamp of the last fully successful loop
    'total_records': 0,         # cumulative count of parsed project entries
    'today_matched': 0,         # matches pushed today (reset by the main loop daily)
    'sources_count': len([s for s in CRAWL_SOURCES if s['enabled']]),  # enabled sources at startup
}

# Lightweight Flask app so the dashboard page can poll crawler health.
app = Flask(__name__)

@app.route('/api/crawler/status')
def status():
    """Report the crawler's current run statistics as JSON."""
    # This handler can only execute while the process is alive, so the
    # 'running' flag is unconditionally True.
    payload = {
        'running': True,
        'last_update': stats['last_update'],
        'last_run_success': stats['last_run_success'],
        'total_records': stats['total_records'],
        'today_matched': stats['today_matched'],
        'enabled_sources': stats['sources_count'],
        'total_sources': len(CRAWL_SOURCES),
    }
    return jsonify(payload)

# Config accessor shared by startup logging and the crawl loop.
def get_enabled_sources():
    """Return the crawl sources whose 'enabled' flag is set."""
    return list(filter(lambda src: src['enabled'], CRAWL_SOURCES))

def contains_keyword(content, keywords):
    """Return True if *content* contains any of *keywords*, case-insensitively.

    *content* may be any value; falsy values (None, "", 0) never match, and
    non-strings are coerced with str() before matching.
    """
    if not content:
        return False
    haystack = str(content).lower()
    return any(kw.lower() in haystack for kw in keywords)

def fetch_page(url):
    """Fetch *url* and return its decoded HTML text, or None on any failure.

    Failures (network errors, timeouts, non-2xx HTTP statuses) are logged and
    swallowed so one broken source cannot break the whole crawl cycle.
    """
    headers = {
        # Desktop Chrome UA: some portals reject the default python-requests
        # user agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=20)
        # BUG FIX: treat 4xx/5xx as failures — previously error pages (e.g.
        # a 404 body) were returned and parsed as if they were listings.
        r.raise_for_status()
        # Force UTF-8 decoding, as the original did — assumes all configured
        # sources serve UTF-8; TODO(review): confirm none use GBK.
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(f"[{datetime.now()}] 抓取失败 {url}: {e}")
        return None

def parse_projects(html, source_name):
    """Extract candidate project entries from raw HTML.

    Generic fallback parser: every anchor whose link text is reasonably long
    and whose href looks absolute ('http' appears in it) becomes one project
    dict. Site-specific parsing rules can replace this per source.

    Returns a list of dicts with keys: title, url, source, publish_time.
    Parsing errors are logged and yield whatever was collected so far.
    """
    projects = []
    try:
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            title = anchor.get_text(strip=True)
            href = anchor['href']
            # Short link texts are almost always navigation items; relative
            # hrefs are skipped as well.
            if len(title) > 10 and 'http' in href:
                projects.append({
                    'title': title,
                    'url': href,
                    'source': source_name,
                    'publish_time': datetime.now().isoformat(),
                })
        print(f"[{datetime.now()}] 从 {source_name} 解析到 {len(projects)} 个项目")
    except Exception as e:
        print(f"[{datetime.now()}] 解析失败 {source_name}: {e}")
    return projects

def send_business_opportunity_card(projects):
    """Format *projects* as a digest card, push it, and return the count.

    At most the first 20 projects are listed in full; the remainder is
    summarized as a trailing count. Delivery is best-effort: push failures
    are logged, and the match count is returned regardless.
    """
    if not projects:
        return 0

    today = date.today().strftime("%Y-%m-%d")
    count = len(projects)

    # Assemble the card from parts and join once, instead of repeated +=.
    parts = [
        f"💼 今日深圳商业机会推送 - {today}\n\n",
        f"共检测到 {count} 个新商机匹配：\n\n",
    ]
    for idx, proj in enumerate(projects[:20], 1):
        parts.append(f"{idx}. **{proj['title']}**\n")
        parts.append(f"   来源: {proj['source']}\n")
        parts.append(f"   {proj['url']}\n\n")
    if count > 20:
        parts.append(f"...还有 {count - 20} 个项目\n")
    text = "".join(parts)

    # Push via the OpenClaw messaging helper (imported lazily; failure to
    # import or send is tolerated).
    try:
        from utils import send_message
        send_message(text)
    except Exception as e:
        print(f"[{datetime.now()}] 推送失败: {e}")
    return count

if __name__ == '__main__':
    # Startup banner: log the active configuration so a misconfigured
    # deployment is visible immediately.
    print(f"[{datetime.now()}] 深圳商机大数据爬虫启动")
    print(f"启用 {len(get_enabled_sources())} 个抓取源")
    print(f"商机关键字: {BUSINESS_KEYWORDS}")
    print(f"室分关键字: {ROOM_DISTRIBUTION_KEYWORDS}")

    # Daily counters reset when the calendar day changes.
    stats['today_matched'] = 0
    last_business_day = date.today().day
    # BUG FIX: track the last business-digest push in its own timestamp.
    # The original compared against stats['last_run_success'], which is
    # refreshed on every loop iteration, so the elapsed time never reached
    # PUSH_CONFIG['business_hours'] and the daily digest was never sent.
    last_business_push = None

    # Serve the status API from a daemon thread (dies with the main process).
    # Port 8001 is used to avoid clashing with the page server on 8000.
    import threading
    threading.Thread(target=app.run, kwargs={'host': '0.0.0.0', 'port': 8001}, daemon=True).start()
    print(f"[{datetime.now()}] 状态API监听在 http://0.0.0.0:8001/api/crawler/status")

    # Give Flask a moment to bind before the first crawl.
    time.sleep(2)

    # Main crawl loop: fetch every enabled source, match keywords, push.
    while True:
        try:
            current_day = date.today().day

            # New calendar day -> reset the daily match counter.
            if current_day != last_business_day:
                stats['today_matched'] = 0
                last_business_day = current_day

            all_projects = []
            for source in get_enabled_sources():
                print(f"[{datetime.now()}] 开始抓取: {source['name']} -> {source['url']}")
                html = fetch_page(source['url'])
                if html:
                    projects = parse_projects(html, source['name'])
                    all_projects.extend(projects)
                    stats['total_records'] += len(projects)

            stats['last_update'] = datetime.now().isoformat()

            if not all_projects:
                print(f"[{datetime.now()}] 本次抓取没有获取到项目")
                time.sleep(PUSH_CONFIG['room_distribution_hours'] * 3600)
                continue

            # Room-distribution (室分) matches are pushed on every cycle.
            room_matched = [p for p in all_projects if contains_keyword(p['title'], ROOM_DISTRIBUTION_KEYWORDS)]
            # General business matches are throttled to once per
            # PUSH_CONFIG['business_hours'] hours (typically daily).
            business_matched = [p for p in all_projects if contains_keyword(p['title'], BUSINESS_KEYWORDS)]

            print(f"[{datetime.now()}] 本次抓取: 总{len(all_projects)} 室分匹配{len(room_matched)} 商机匹配{len(business_matched)}")

            if room_matched:
                count = send_business_opportunity_card(room_matched)
                stats['today_matched'] += count
                print(f"[{datetime.now()}] 推送室分商机 {count} 条")

            # Business digest: the first matching cycle pushes immediately
            # (last_business_push is None); afterwards only when the
            # configured number of hours has elapsed since the last PUSH.
            digest_due = (
                last_business_push is None
                or (datetime.now() - last_business_push).total_seconds() / 3600
                >= PUSH_CONFIG['business_hours']
            )
            if digest_due and business_matched:
                count = send_business_opportunity_card(business_matched)
                stats['today_matched'] += count
                last_business_push = datetime.now()
                print(f"[{datetime.now()}] 推送全行业商机 {count} 条")

            stats['last_run_success'] = datetime.now().isoformat()

        except Exception as e:
            # Never let one bad cycle kill the crawler; log and keep looping.
            print(f"[{datetime.now()}] 循环异常: {e}")

        # Sleep for the configured check interval (the room-distribution
        # cadence is the shortest, so it drives the loop period).
        sleep_hours = PUSH_CONFIG['room_distribution_hours']
        print(f"[{datetime.now()}] 下次检查在 {sleep_hours} 小时后...")
        time.sleep(sleep_hours * 3600)
