38265-vm/core/views.py
2026-02-07 11:00:38 +00:00

218 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
import csv
import io
import time
import pandas as pd
import requests
from docx import Document
from django.shortcuts import render, redirect, get_object_or_404
from django.http import HttpResponse, FileResponse
from django.utils import timezone
from .models import ExtractionTask, ExtractedUser
def home(request):
    """Render the landing page with the ten most recently created tasks."""
    recent_tasks = ExtractionTask.objects.order_by('-created_at')[:10]
    context = {"tasks": recent_tasks}
    return render(request, "core/index.html", context)
# Explicit account-id marker followed by the id itself, e.g. "ID: abc12345".
_ID_MARKER_RE = re.compile(r'(?:小红书号|ID|id)[:\s]*([a-zA-Z0-9_.-]{5,})', re.IGNORECASE)
# Everything from an id marker to the end of the line; stripped out of nicknames.
_ID_TAIL_RE = re.compile(r'(?:小红书号|ID|id).*', re.IGNORECASE)
# A whole line that looks like a bare account id (used by the marker-less strategy).
_BARE_ID_LINE_RE = re.compile(r'^[a-zA-Z0-9_.-]{6,15}$')
# A whitespace/comma/semicolon separated token that looks like an account id.
_BARE_ID_CHUNK_RE = re.compile(r'^[a-zA-Z0-9_.-]{6,20}$')
# Time/location line closing a comment: 10-24, 2小时前, 昨天, 刚刚, 3天前, 2024-01-01..., IP...
_COMMENT_TIME_RE = re.compile(r'^(\d{2}-\d{2}|\d+[-天小分][前时钟]*|昨天|刚刚|\d{4}-\d{2}-\d{2}.*|IP.*)$')
_URL_RE = re.compile(r'https?://[^\s]+')


def _split_lines(raw_text):
    """Return the non-blank lines of *raw_text*, each stripped of whitespace."""
    return [line.strip() for line in raw_text.split('\n') if line.strip()]


def _parse_marked_ids(lines):
    """Strategy A: collect ids announced by an explicit marker.

    The nickname is taken from the line preceding the marker line; each id is
    reported once. Returns a list of ``ExtractedUser`` field dicts.
    """
    records = []
    seen = set()
    for index, line in enumerate(lines):
        match = _ID_MARKER_RE.search(line)
        if not match:
            continue
        xhs_id = match.group(1).strip()
        if xhs_id in seen:
            continue
        # The nickname is expected on the previous line; the very first line
        # has no predecessor, so it falls back to the placeholder name.
        nickname = lines[index - 1] if index > 0 else "未知用户"
        # The previous line may itself carry an id marker; drop that part.
        nickname = _ID_TAIL_RE.sub('', nickname).strip() or "小红书用户"
        records.append({"nickname": nickname[:250], "xhs_id": xhs_id[:100]})
        seen.add(xhs_id)
    return records


def _parse_adjacent_pairs(lines):
    """Strategy B: treat "short line, then bare id line" as nickname/id pairs."""
    records = []
    seen = set()
    for nickname, candidate in zip(lines, lines[1:]):
        if not _BARE_ID_LINE_RE.match(candidate) or len(nickname) >= 40:
            continue
        if candidate in seen:
            continue
        records.append({"nickname": nickname[:250], "xhs_id": candidate[:100]})
        seen.add(candidate)
    return records


def _parse_comments(lines):
    """Collect comments laid out as three consecutive lines.

    Pattern: [Nickname] / [Content] / [Time/Location]. A triple is accepted
    when its third line matches the time pattern and the nickname is shorter
    than 50 characters.
    """
    records = []
    cursor = 0
    while cursor < len(lines) - 1:
        nickname = lines[cursor]
        has_time_line = (
            cursor + 2 < len(lines)
            and _COMMENT_TIME_RE.match(lines[cursor + 2])
        )
        # NOTE(review): the original indentation was lost; this assumes the
        # three-line skip only happens when a record is actually emitted.
        if has_time_line and len(nickname) < 50:
            content = lines[cursor + 1]
            time_info = lines[cursor + 2]
            records.append({
                "nickname": nickname[:250],
                "comment_text": f"[{time_info}] {content}",
            })
            cursor += 3
        else:
            cursor += 1
    return records


def _parse_fallback(raw_text):
    """Last resort: record every URL, or failing that every id-like token."""
    urls = _URL_RE.findall(raw_text)
    if urls:
        return [
            {
                "nickname": "待采集主页",
                "profile_url": url[:500],
                "comment_text": "[智能识别] 已锁定目标。由于小红书加密机制,请点击「高精度修复」手动粘贴列表内容。",
            }
            for url in urls
        ]

    records = []
    seen = set()
    for chunk in re.split(r'[\s,;]', raw_text):
        chunk = chunk.strip()
        if _BARE_ID_CHUNK_RE.match(chunk) and chunk not in seen:
            records.append({"nickname": "待分析用户", "xhs_id": chunk[:100]})
            seen.add(chunk)
    return records


def analyze(request):
    """Create an extraction task from pasted text and store what can be parsed.

    Only POST requests with a non-empty ``raw_text`` are processed; anything
    else redirects to ``home``. Parsing runs in phases, each later phase acting
    as a fallback for the earlier ones:

    1. For ``fans``/``following`` tasks: ids with an explicit marker, then
       marker-less nickname/id line pairs if none were found.
    2. For ``comments`` tasks, or when nothing was found so far: three-line
       nickname/content/time comments.
    3. When still nothing was found: URLs, or bare id-like tokens.

    Redirects to the ``task_detail`` page of the newly created task.
    """
    if request.method != "POST":
        return redirect('home')

    task_type = request.POST.get("task_type", "fans")
    raw_text = request.POST.get("raw_text", "").strip()
    if not raw_text:
        return redirect('home')

    task = ExtractionTask.objects.create(
        task_type=task_type,
        raw_text=raw_text,
    )

    lines = _split_lines(raw_text)
    records = []

    # --- PHASE 1: fans/following lists ---
    if task_type in ('fans', 'following'):
        records = _parse_marked_ids(lines) or _parse_adjacent_pairs(lines)

    # --- PHASE 2: comments (also the fallback when phase 1 found nothing) ---
    if task_type == 'comments' or not records:
        records = records + _parse_comments(lines)

    # --- PHASE 3: links / bare ids ---
    if not records:
        records = _parse_fallback(raw_text)

    for fields in records:
        ExtractedUser.objects.create(task=task, **fields)

    return redirect('task_detail', task_id=task.id)
def task_detail(request, task_id):
    """Show one extraction task together with its extracted users.

    Responds with 404 when no task has the given id. The template also
    receives ``needs_paste``, which is true when the task has at most one
    extracted user and its raw text is shorter than 300 characters.
    """
    task = get_object_or_404(ExtractionTask, id=task_id)
    users = task.users.all()
    # Evaluated in the same order as before: the count query runs first and
    # the text length is only checked when the count is small enough.
    needs_paste = task.users.count() <= 1 and len(task.raw_text) < 300
    context = {
        "task": task,
        "users": users,
        "needs_paste": needs_paste,
    }
    return render(request, "core/task_detail.html", context)
def history(request):
    """Render the history page listing every task, newest first."""
    all_tasks = ExtractionTask.objects.order_by('-created_at')
    context = {"tasks": all_tasks}
    return render(request, "core/history.html", context)
def export_task(request, task_id, format):
    """Download a task's extracted users as a CSV, Excel or Word file.

    Args:
        request: The incoming HTTP request.
        task_id: Id of the ``ExtractionTask`` to export; 404 if it is missing.
        format: One of ``'csv'``, ``'excel'`` or ``'word'``. (The name shadows
            the builtin but is kept because callers pass it by keyword.)

    Returns:
        An attachment response for a known format, otherwise a redirect back
        to the task detail page. A task without users is exported as a single
        placeholder row.
    """
    task = get_object_or_404(ExtractionTask, id=task_id)

    data = [
        {
            "昵称": user.nickname,
            "小红书ID": user.xhs_id,
            "主页链接": user.profile_url,
            "评论/备注": user.comment_text,
            "提取时间": user.extracted_at.strftime('%Y-%m-%d %H:%M'),
        }
        for user in task.users.all()
    ]
    if not data:
        data = [{"昵称": "未提取到数据", "小红书ID": "-", "主页链接": "-"}]

    df = pd.DataFrame(data)
    timestamp = timezone.now().strftime('%Y%m%d_%H%M')
    # Single braces: the original doubled braces emitted the placeholder text
    # literally instead of interpolating the values.
    filename = f"xhs_{task.task_type}_{timestamp}"

    if format == 'csv':
        # Encode explicitly so the UTF-8 BOM is really written; pandas does not
        # apply `encoding` when handed a non-binary file-like object.
        csv_bytes = df.to_csv(index=False).encode('utf-8-sig')
        response = HttpResponse(csv_bytes, content_type='text/csv')
        response['Content-Disposition'] = f'attachment; filename="{filename}.csv"'
        return response

    if format == 'excel':
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='Data')
        response = HttpResponse(
            output.getvalue(),
            content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        )
        response['Content-Disposition'] = f'attachment; filename="{filename}.xlsx"'
        return response

    if format == 'word':
        exported_at = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
        doc = Document()
        doc.add_heading(f'小红书数据导出 - {task.get_task_type_display()}', 0)
        doc.add_paragraph(f'导出时间: {exported_at}\n')

        if not df.empty:
            table = doc.add_table(rows=1, cols=len(df.columns))
            for header_cell, column in zip(table.rows[0].cells, df.columns):
                header_cell.text = column
            for values in df.itertuples(index=False, name=None):
                row_cells = table.add_row().cells
                for cell, value in zip(row_cells, values):
                    # Empty/None values become blank cells rather than "None".
                    cell.text = str(value) if value else ""

        output = io.BytesIO()
        doc.save(output)
        response = HttpResponse(
            output.getvalue(),
            content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        )
        response['Content-Disposition'] = f'attachment; filename="{filename}.docx"'
        return response

    # Unknown format: fall back to the task page.
    return redirect('task_detail', task_id=task.id)