# antiscam-pro/api/endpoints.py

# api/endpoints.py
import re
import os
import requests
import base64
from .utils import load_keywords, load_numbers, load_domains, load_shorteners, resolve_redirect
from .validators import Validators
from transformers import pipeline
import logging

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Import scanning function from a separate virustotal.py
from .virustotal import scan_url_with_virustotal

# VirusTotal API key is not needed here since the function in virustotal.py uses it

# Initialize AI model
# The classifier is built once, at import time, and reused by check_message().
# If loading fails for any reason the module still imports: spam_classifier is
# set to None and check_message() falls back to keyword matching only.
try:
    # NOTE(review): an earlier comment here mentioned `suppress_warnings=True`,
    # but no such argument is passed to pipeline() - warnings are not silenced.
    spam_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-sms-spam-detection")
    logger.info("AI spam detection model loaded successfully.")
except Exception as e:
    # Deliberately broad: any loading failure only disables the AI check.
    logger.warning(f"Failed to load AI model: {e}")
    spam_classifier = None

# check_message - LOGIC FUNCTION
def check_message(text: str) -> dict:
    """Check a text message for suspicious keywords and AI-detected spam.

    Args:
        text: The message body to analyse.

    Returns:
        A dict with three keys:
        ``suspicious_words`` - keywords from ``load_keywords()`` found in
        ``text`` (case-insensitive substring match);
        ``ai_result`` - ``None`` when no classifier is loaded, otherwise
        ``{"label": "SPAM" | "HAM" | "ERROR", "confidence": float}``;
        ``is_suspicious`` - ``True`` when any keyword matched, or when the
        classifier says SPAM with confidence >= 0.6.
    """
    # Lower-case the message once instead of once per keyword.
    text_lower = text.lower()
    stripped_keywords = (kw.strip() for kw in load_keywords())
    found_keywords = [kw for kw in stripped_keywords if kw and kw.lower() in text_lower]

    # Keyword matches alone are enough to flag the message; the AI verdict
    # below can only add to this, never remove it.
    result = {
        "suspicious_words": found_keywords,
        "ai_result": None,
        "is_suspicious": bool(found_keywords),
    }

    if spam_classifier is not None:
        try:
            max_len = 512
            words = text.split()
            # Cheap pre-trim by words to bound tokenizer work on huge inputs.
            if len(words) > max_len:
                text_for_ai = " ".join(words[:max_len])
                logger.warning(f"Message trimmed to {max_len} words for AI analysis.")
            else:
                text_for_ai = text

            # A word is often several sub-word tokens, so the word trim above
            # does not guarantee the model's 512-token limit. Ask the
            # tokenizer to truncate so long messages are classified instead
            # of failing inside the model.
            prediction = spam_classifier(text_for_ai, truncation=True, max_length=max_len)[0]
            label = prediction["label"]
            confidence = round(prediction["score"], 4)
            is_spam = label == "LABEL_1"  # LABEL_1 assumed to be SPAM
            result["ai_result"] = {
                "label": "SPAM" if is_spam else "HAM",
                "confidence": confidence,
            }

            # Suspicion if AI >= 0.6 OR keywords found
            if is_spam and confidence >= 0.6:
                result["is_suspicious"] = True
        except Exception as ai_e:
            # Best effort: an AI failure leaves the keyword-only verdict.
            logger.error(f"Error during AI message classification: {ai_e}")
            result["ai_result"] = {"label": "ERROR", "confidence": 0}

    logger.info(f"check_message result: {result}")
    return result

# check_phone - LOGIC FUNCTION
def check_phone(number: str) -> dict:
    """Check a phone number's format and look it up among known scam numbers.

    Args:
        number: The phone number as entered, with or without a leading ``+``.

    Returns:
        ``{"is_valid": ..., "is_suspicious": bool}`` where ``is_valid`` is
        whatever ``Validators.is_valid_phone(number)`` returns and
        ``is_suspicious`` tells whether the number is in ``load_numbers()``.
    """
    # Normalise BOTH sides the same way (trim whitespace, drop leading '+').
    # Stripping only the input meant an entry stored as "+48..." never matched.
    known_scams = {entry.strip().lstrip('+') for entry in load_numbers()}
    known_scams.discard("")  # blank lines or a lone '+' must never match

    is_valid = Validators.is_valid_phone(number)
    number_for_check = number.strip().lstrip('+')
    is_suspicious = number_for_check in known_scams

    result = {
        "is_valid": is_valid,
        "is_suspicious": is_suspicious
    }
    logger.info(f"check_phone result for {number}: {result}")
    return result

# check_link - LOGIC FUNCTION
def check_link(link: str) -> dict:
    """Check a URL with local heuristics and, when they fire, VirusTotal.

    Steps:
      1. Resolve redirects with ``resolve_redirect``; a failure is reported
         as an invalid, suspicious link.
      2. Validate the resolved URL with ``Validators.is_valid_url``.
      3. Local analysis of the resolved URL: suspicious keywords anywhere in
         the URL, suspicious TLDs on the host, and entries from
         ``load_domains()``; plus entries from ``load_shorteners()`` matched
         against the ORIGINAL link.
      4. If any local check fired, scan the resolved URL with VirusTotal.

    Args:
        link: The URL as submitted by the user.

    Returns:
        A dict with keys ``is_valid``, ``is_suspicious``, ``details`` (a list
        of ``{"text", "data-pl", "data-en"}`` dicts) and ``source`` (one of
        ``"none"``, ``"local"``, ``"combined"``, ``"virustotal"``,
        ``"unknown"``).
    """
    logger.info(f"Checking link: {link}")

    try:
        link_resolved = resolve_redirect(link)
        logger.info(f"Resolved link: {link_resolved}")
    except Exception as resolve_e:
        logger.error(f"Error resolving link {link}: {resolve_e}")
        return {
            "is_valid": False,
            "is_suspicious": True,
            "details": [_localized(
                f"Error resolving redirect: {resolve_e}",
                f"Błąd rozwiązywania przekierowania: {resolve_e}",
                f"Error resolving redirect: {resolve_e}",
            )],
            "source": "local"
        }

    is_valid = Validators.is_valid_url(link_resolved)
    if not is_valid:
        logger.warning(f"Resolved link is not a valid URL: {link_resolved}")
        return {
            "is_valid": False,
            "is_suspicious": False,
            "details": [_localized(
                "Invalid resolved URL format",
                "Niepoprawny format rozwiązanego URL",
                "Invalid resolved URL format",
            )],
            "source": "local"
        }

    link_clean = link_resolved.lower()
    link_original = link.lower()  # hoisted: used for every shortener entry
    host = _hostname_of(link_clean)
    suspicious_reasons = []

    # Local analysis - suspicious patterns in URL
    suspicious_patterns = ["free", "gift", "login", "verify", "paypal", "paypa1", "bank", ".ru", ".cn"]
    for pattern in suspicious_patterns:
        if pattern.startswith(".") and host:
            # TLD patterns must match the end of the host; a plain substring
            # test also fired on paths such as "/docs.rules" or "/a.cnf".
            matched = host.endswith(pattern)
        else:
            # Keywords may appear anywhere. This is also the fallback for
            # TLD patterns when the host could not be extracted.
            matched = pattern in link_clean
        if matched:
            suspicious_reasons.append(_localized(
                f"Suspicious pattern found in URL: {pattern}",
                f"Podejrzany wzorzec '{pattern}' znaleziony w adresie URL.",
                f"Suspicious pattern '{pattern}' found in the URL.",
            ))
            logger.info(f"Suspicious pattern '{pattern}' found in link: {link_resolved}")

    # Local analysis - scam domains from database
    for domain in (entry.strip() for entry in load_domains()):
        # The URL is lower-cased, so the entry must be too; otherwise a
        # mixed-case database entry could never match.
        if domain and domain.lower() in link_clean:
            suspicious_reasons.append(_localized(
                f"Domain '{domain}' marked as suspicious in our database.",
                f"Domena '{domain}' oznaczona jako podejrzana w naszej bazie danych.",
                f"Domain '{domain}' marked as suspicious in our database.",
            ))
            logger.info(f"Suspicious domain '{domain}' found in link: {link_resolved}")

    # Local analysis - URL shorteners (check original link, since the resolved
    # link no longer points at the shortener)
    for short in (entry.strip() for entry in load_shorteners()):
        if short and short.lower() in link_original:
            suspicious_reasons.append(_localized(
                f"Link shortener detected in original link: {short}",
                f"Wykryto usługę skracania URL w oryginalnym linku: {short}",
                f"Link shortener detected in original link: {short}",
            ))
            logger.info(f"URL shortener '{short}' detected in original link: {link}")

    # A shortener hit also adds a reason, so this single flag covers both the
    # "locally suspicious" and the "shortened" trigger for the VirusTotal scan.
    local_is_suspicious = bool(suspicious_reasons)

    # VirusTotal scan: only if local suspicious findings or short link
    vt_scanned = False
    vt_result = {}
    if local_is_suspicious:
        vt_scanned = True
        logger.info(f"Local suspicious or short link -> checking VirusTotal for: {link_resolved}")
        try:
            vt_result = scan_url_with_virustotal(link_resolved)
        except Exception as vt_e:
            # A failing remote scan must not throw away the local findings.
            logger.error(f"VirusTotal scan failed for {link_resolved}: {vt_e}")
            vt_result = {"error": str(vt_e)}
        if not isinstance(vt_result, dict):
            vt_result = {"error": "No result returned"}
        logger.info(f"VirusTotal result for {link_resolved}: {vt_result}")

    vt_detected = vt_result.get("detected", False)

    combined_details = list(suspicious_reasons)

    if vt_scanned:
        vt_error = vt_result.get("error")
        if vt_error:
            vt_summary_text = f"VirusTotal: Error - {vt_error}"
            vt_summary_pl = f"VirusTotal: Błąd - {vt_error}"
            vt_summary_en = f"VirusTotal: Error - {vt_error}"
        else:
            vt_positives = vt_result.get("positives", 0)
            vt_total = vt_result.get("total", 0)
            vt_scan_date = vt_result.get("scan_date", "N/A")

            vt_summary_text = f"VirusTotal: {vt_positives} / {vt_total} engines flagged the link."
            vt_summary_pl = f"VirusTotal: {vt_positives} / {vt_total} silników oznaczyło link."
            vt_summary_en = f"VirusTotal: {vt_positives} / {vt_total} engines flagged the link."

            if vt_scan_date and vt_scan_date != "N/A":
                vt_summary_text += f" Scan date: {vt_scan_date}."
                # The Polish summary previously got the English wording.
                vt_summary_pl += f" Data skanowania: {vt_scan_date}."
                vt_summary_en += f" Scan date: {vt_scan_date}."

        combined_details.append(_localized(vt_summary_text, vt_summary_pl, vt_summary_en))
    else:
        logger.info(f"VirusTotal scan skipped for link: {link_resolved} (local analysis clean and not shortened).")

    final_is_suspicious = local_is_suspicious or vt_detected

    if not combined_details:
        combined_details.append(_localized(
            "No suspicious activity detected based on available checks.",
            "Nie wykryto podejrzanej aktywności na podstawie dostępnych sprawdzeń.",
            "No suspicious activity detected based on available checks.",
        ))
        source = "none"
    elif local_is_suspicious and vt_detected:
        source = "combined"
    elif vt_detected:
        # Not reachable while VirusTotal runs only after a local finding;
        # kept so the mapping stays correct if that policy changes.
        source = "virustotal"
    elif local_is_suspicious:
        source = "local"
    else:
        source = "unknown"

    result = {
        "is_valid": is_valid,
        "is_suspicious": final_is_suspicious,
        "details": combined_details,
        "source": source
    }
    logger.info(f"Final result for check_link ({link}): {result}")
    return result


def _localized(text: str, pl: str, en: str) -> dict:
    """Build one detail entry carrying the default, Polish and English text."""
    return {"text": text, "data-pl": pl, "data-en": en}


def _hostname_of(url: str) -> str:
    """Return the lower-cased host part of *url*, or "" if it cannot be parsed."""
    # Function-scope import so this helper does not depend on the file header.
    from urllib.parse import urlsplit

    try:
        host = urlsplit(url).hostname
    except ValueError:
        # urlsplit raises on malformed netlocs (e.g. unbalanced IPv6 brackets).
        return ""
    # hostname is None for scheme-less URLs; a trailing dot is a valid FQDN form.
    return (host or "").rstrip(".")