"""Views for uploading HTML bundles and exporting them as a combined PDF."""

from html.parser import HTMLParser
from io import BytesIO
import textwrap

from django.contrib import messages
from django.db import transaction
from django.http import HttpResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.utils import timezone
from django.utils.text import slugify
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

from .forms import BundleUploadForm
from .models import HtmlBundle, HtmlDocument, HtmlExport


class _HtmlTextExtractor(HTMLParser):
    """Collect the visible text of an HTML document into ``self.parts``.

    Block-level tags contribute a newline on both open and close so their
    content ends up on its own line once the parts are joined. Text found
    inside ``<script>`` or ``<style>`` elements is discarded.
    """

    block_tags = {
        "p",
        "div",
        "br",
        "li",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "section",
        "article",
        "header",
        "footer",
    }
    # Elements whose body is code or styling rather than readable text.
    skipped_tags = {"script", "style"}

    def __init__(self):
        super().__init__()
        self.parts = []
        # Number of currently open skipped elements; data is ignored while > 0.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self.skipped_tags:
            self._skip_depth += 1
        elif tag in self.block_tags:
            self.parts.append("\n")

    def handle_endtag(self, tag):
        if tag in self.skipped_tags:
            # Guard against a stray closing tag with no matching opener.
            if self._skip_depth:
                self._skip_depth -= 1
        elif tag in self.block_tags:
            self.parts.append("\n")

    def handle_data(self, data):
        if self._skip_depth:
            return
        # A whitespace-only node still separates neighbouring inline elements
        # ("<b>a</b> <i>b</i>"); keep one space so the words are not fused.
        # Leading/trailing spaces are stripped per line by the caller.
        self.parts.append(data if data.strip() else " ")


def _extract_text_from_html(html_bytes: bytes) -> str:
    """Return the visible text of *html_bytes*, one non-empty line per row.

    The bytes are decoded as UTF-8 with undecodable sequences replaced.
    Each resulting line is stripped and blank lines are dropped.
    """
    parser = _HtmlTextExtractor()
    parser.feed(html_bytes.decode("utf-8", errors="replace"))
    # feed() may hold back trailing text (e.g. an unterminated "&" near the
    # end of input); close() forces the buffered remainder to be processed.
    parser.close()

    stripped_lines = (line.strip() for line in "".join(parser.parts).splitlines())
    return "\n".join(line for line in stripped_lines if line)


def _build_pdf(bundle: HtmlBundle) -> BytesIO:
    """Render the text of every document in *bundle* into a letter-size PDF.

    Returns a ``BytesIO`` positioned at offset 0 containing the PDF.
    """
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=letter)
    _, page_height = letter
    margin = 72
    line_height = 14
    top_of_page = page_height - margin
    y = top_of_page

    # Order explicitly by the user-editable ``order`` field (``pk`` breaks
    # ties) instead of relying on the model's default ordering, which is not
    # visible from this module.
    for document in bundle.documents.order_by("order", "pk"):
        text = document.content_text or ""
        for paragraph in text.splitlines() or [""]:
            for line in textwrap.wrap(paragraph, width=95) or [""]:
                if y < margin:
                    pdf.showPage()
                    y = top_of_page
                # Set on every line so the font is also in effect on pages
                # started by showPage().
                pdf.setFont("Helvetica", 11)
                pdf.drawString(margin, y, line)
                y -= line_height
            y -= 6  # extra gap between paragraphs
        y -= 12  # extra gap between documents

    pdf.save()
    buffer.seek(0)
    return buffer


def _pdf_response(pdf_buffer: BytesIO, file_name: str) -> HttpResponse:
    """Wrap *pdf_buffer* in an attachment response named *file_name*."""
    response = HttpResponse(pdf_buffer.getvalue(), content_type="application/pdf")
    response["Content-Disposition"] = f'attachment; filename="{file_name}"'
    return response


def _validate_uploads(files) -> list:
    """Return the error messages for the uploaded *files* (empty when valid)."""
    if not files:
        return ["No files were received. Please ensure you have selected files and try again."]

    invalid_files = [f.name for f in files if not f.name.lower().endswith((".html", ".htm"))]
    if invalid_files:
        return [f"Only .html or .htm files are supported. Invalid: {', '.join(invalid_files[:3])}"]
    return []


def _create_bundle(title: str, files) -> HtmlBundle:
    """Create a bundle plus one document per uploaded file, all or nothing."""
    # A failure while reading or parsing any file rolls back the whole bundle
    # instead of leaving a partially populated one behind.
    with transaction.atomic():
        bundle = HtmlBundle.objects.create(title=title)
        for index, uploaded in enumerate(files, start=1):
            HtmlDocument.objects.create(
                bundle=bundle,
                original_name=uploaded.name,
                order=index,
                content_text=_extract_text_from_html(uploaded.read()),
            )
    return bundle


def home(request):
    """Show the upload form and recent activity; on POST, create a bundle.

    A valid POST with only .html/.htm files creates the bundle and redirects
    to its detail page. Otherwise the page is re-rendered with the form and
    any file-level errors.
    """
    file_errors = []
    form = BundleUploadForm()

    if request.method == "POST":
        form = BundleUploadForm(request.POST)
        files = request.FILES.getlist("files")
        if form.is_valid():
            file_errors = _validate_uploads(files)
            if not file_errors:
                # ``or ""`` keeps this safe if the cleaned title is None.
                title = (form.cleaned_data.get("title") or "").strip()
                bundle = _create_bundle(title, files)
                messages.success(
                    request,
                    f"Bundle '{bundle.title or 'Untitled'}' created with {len(files)} files.",
                )
                return redirect("bundle_detail", bundle_id=bundle.id)

    bundles = HtmlBundle.objects.order_by("-created_at")[:5]
    exports = HtmlExport.objects.select_related("bundle").order_by("-created_at")[:5]
    context = {
        "page_title": "HTML Bundle to PDF",
        "page_description": "Upload multiple HTML files, arrange the order, and export a single PDF instantly.",
        "form": form,
        "file_errors": file_errors,
        "bundles": bundles,
        "exports": exports,
    }
    return render(request, "core/index.html", context)


def bundle_list(request):
    """List every bundle, newest first."""
    context = {
        "page_title": "All Bundles",
        "page_description": "Browse recent HTML bundles and download combined PDFs.",
        "bundles": HtmlBundle.objects.order_by("-created_at"),
    }
    return render(request, "core/bundle_list.html", context)


def bundle_detail(request, bundle_id: int):
    """Show one bundle; on an ``update_order`` POST, save new document orders.

    Order values are read from ``order_<document id>`` POST fields. Any value
    that is not a positive integer aborts the whole update with an error
    message. Responds with 404 when the bundle does not exist.
    """
    bundle = get_object_or_404(HtmlBundle, pk=bundle_id)
    # Same explicit ordering as the PDF builder so the page matches the export.
    documents = bundle.documents.order_by("order", "pk")
    exports = bundle.exports.order_by("-created_at")

    if request.method == "POST" and request.POST.get("action") == "update_order":
        updates = []
        for document in documents:
            raw_value = request.POST.get(f"order_{document.id}")
            if raw_value is None:
                # No field submitted for this document: leave it untouched.
                continue
            try:
                order_value = int(raw_value)
            except ValueError:
                order_value = None
            if order_value is None or order_value < 1:
                messages.error(request, "Order values must be positive numbers.")
                return redirect("bundle_detail", bundle_id=bundle.id)
            document.order = order_value
            updates.append(document)

        # Nothing is written until every submitted value has been validated.
        if updates:
            HtmlDocument.objects.bulk_update(updates, ["order"])
            messages.success(request, "Order updated.")
        return redirect("bundle_detail", bundle_id=bundle.id)

    context = {
        "page_title": bundle.title or "Untitled bundle",
        "page_description": "Review your bundle and generate a combined PDF.",
        "bundle": bundle,
        "documents": documents,
        "exports": exports,
    }
    return render(request, "core/bundle_detail.html", context)


def bundle_download(request, bundle_id: int):
    """Build the bundle's PDF, record the export, and return it as a download.

    The file name is ``<slug>-<YYYYmmdd-HHMM>.pdf``, where the slug falls
    back to ``bundle`` when the title slugifies to an empty string.
    """
    bundle = get_object_or_404(HtmlBundle, pk=bundle_id)
    slug = slugify(bundle.title) or "bundle"
    timestamp = timezone.now().strftime("%Y%m%d-%H%M")
    file_name = f"{slug}-{timestamp}.pdf"

    # Render first: if PDF generation fails, no export row is recorded for a
    # file that was never delivered.
    pdf_buffer = _build_pdf(bundle)
    HtmlExport.objects.create(bundle=bundle, file_name=file_name)
    return _pdf_response(pdf_buffer, file_name)


def export_download(request, export_id: int):
    """Rebuild the PDF for a past export and return it under its stored name.

    The PDF is regenerated from the bundle's current documents; no stored
    file is read here. Responds with 404 when the export does not exist.
    """
    export = get_object_or_404(HtmlExport, pk=export_id)
    return _pdf_response(_build_pdf(export.bundle), export.file_name)