fix: worker batch report 'Total Paid' was inflated by the work-log join (+ kill its N+1s)

_build_worker_report_context computed Sum('payroll_records__amount_paid') in the SAME .annotate() as counts over work_logs and warnings. Django joins ALL those relations into one query, so each payroll row was duplicated once per work-log row — the lifetime Total Paid column on /workers/report/ (HTML, CSV and PDF) was multiplied by the worker's log count (3 logs + one R100 payslip displayed R300). The distinct=True counts were immune, which is why the suite never caught it. Verified by a controlled experiment before fixing; regression test added. Restructure: payroll aggregates stay in one annotate (same join — safe); days-worked, warnings and project names move to three batched GROUP BY dicts; teams + certificates are prefetched. Also removes the ~4 queries per worker the loop used to fire (audit findings #8). Display-only bug — payment processing was never affected. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 17:56:39 +02:00 · 2026-06-12 17:56:39 +02:00 · 25910b2861
commit 25910b2861
parent f0f3938621
2 changed files with 98 additions and 26 deletions
--- a/core/tests.py
+++ b/core/tests.py
@ -3901,3 +3901,38 @@ class WorkerReportBadParamsTests(TestCase):
    def test_csv_non_numeric_team_falls_back(self):
        resp = self.client.get(reverse('worker_batch_report_csv'), {'team': 'abc'})
        self.assertEqual(resp.status_code, 200)
+
+
+# =============================================================================
+# === AUDIT FINDING (Jun 2026) — WORKER REPORT LIFETIME TOTAL INFLATION ===
+# _build_worker_report_context computed Sum('payroll_records__amount_paid')
+# in the SAME .annotate() as counts over work_logs and warnings. Django
+# builds ONE query with all the joins, so each payroll record row is
+# duplicated once per work-log row — the lifetime "Total Paid" column was
+# multiplied by the worker's log count (3 logs + one R100 payslip showed
+# R300). The distinct=True counts were immune, which is why it hid.
+# =============================================================================
+
+class WorkerReportLifetimeTotalsTests(TestCase):
+    """total_paid_lifetime must not be multiplied by the work-log join."""
+
+    def test_total_paid_not_inflated_by_work_log_joins(self):
+        from core.views import _build_worker_report_context
+        admin = User.objects.create_user(username='infl_admin', is_staff=True)
+        p = Project.objects.create(name='Infl Project')
+        w = Worker.objects.create(
+            name='Infl Worker', id_number='INF-1',
+            monthly_salary=Decimal('4000'))
+        for d in (1, 2, 3):                       # 3 logs → 3 join rows
+            wl = WorkLog.objects.create(
+                date=datetime.date(2026, 6, d), project=p, supervisor=admin)
+            wl.workers.add(w)
+        PayrollRecord.objects.create(
+            worker=w, amount_paid=Decimal('100.00'),
+            date=datetime.date(2026, 6, 4))       # ONE payslip of R100
+
+        rows = _build_worker_report_context()
+        row = next(r for r in rows if r['worker'].id == w.id)
+        self.assertEqual(row['total_paid_lifetime'], Decimal('100.00'))
+        self.assertEqual(row['days_worked'], 3)
+        self.assertEqual(row['payslip_count'], 1)
--- a/core/views.py
+++ b/core/views.py
@ -1666,38 +1666,71 @@ def _build_worker_report_context(status=None, project_id=None, team_id=None):
    and warning counts. All aggregates are computed in a single query
    via annotate/prefetch to avoid N+1 database hits.
    """
-    workers = Worker.objects.all()
+    base = Worker.objects.all()
    if status == 'active':
-        workers = workers.filter(active=True)
+        base = base.filter(active=True)
    elif status == 'inactive':
-        workers = workers.filter(active=False)
+        base = base.filter(active=False)

    if project_id:
-        workers = workers.filter(work_logs__project_id=project_id).distinct()
+        base = base.filter(work_logs__project_id=project_id).distinct()
    if team_id:
-        workers = workers.filter(teams__id=team_id).distinct()
+        base = base.filter(teams__id=team_id).distinct()

-    workers = workers.annotate(
-        _days_worked=Count('work_logs__date', distinct=True),
+    # === AGGREGATION SAFETY (audit fix, Jun 2026) ===
+    # All four annotations below ride the SAME payroll_records join, so
+    # they can't multiply each other. The previous version also folded
+    # work_logs and warnings counts into this one annotate — Django then
+    # builds a single query joining ALL the relations, and every payroll
+    # row is duplicated once per work-log row. Result: the lifetime
+    # "Total Paid" Sum was multiplied by the worker's log count (the
+    # distinct=True counts were immune, which is why it went unnoticed).
+    # work_logs / warnings / projects are aggregated in their own batched
+    # GROUP BY queries below — same numbers, no shared-join cross product,
+    # and no per-worker queries inside the loop either.
+    workers = base.annotate(
        _first_payslip_date=Min('payroll_records__date'),
        _last_payslip_date=Max('payroll_records__date'),
        _total_paid_lifetime=Sum('payroll_records__amount_paid'),
        _payslip_count=Count('payroll_records', distinct=True),
-        _active_warnings=Count('warnings', distinct=True),
-    ).order_by('name')
+    ).prefetch_related('teams', 'certificates').order_by('name')
+
+    worker_id_qs = base.values('id')
+
+    # Days worked per worker (distinct dates across their work logs)
+    days_by_worker = dict(
+        WorkLog.objects.filter(workers__id__in=worker_id_qs)
+        .values('workers__id')
+        .annotate(days=Count('date', distinct=True))
+        .values_list('workers__id', 'days')
+    )
+    # Warning count per worker
+    warnings_by_worker = dict(
+        WorkerWarning.objects.filter(worker_id__in=worker_id_qs)
+        .values('worker_id')
+        .annotate(n=Count('id'))
+        .values_list('worker_id', 'n')
+    )
+    # Distinct project names each worker has appeared on
+    projects_by_worker = {}
+    for wid, pname in (
+        WorkLog.objects.filter(workers__id__in=worker_id_qs)
+        .values_list('workers__id', 'project__name')
+        .distinct()
+        .order_by('project__name')
+    ):
+        projects_by_worker.setdefault(wid, []).append(pname)

    today = timezone.localdate()
    thirty_days_out = today + datetime.timedelta(days=30)

    rows = []
    for w in workers:
-        projects = list(
-            Project.objects.filter(work_logs__workers=w).distinct().values_list('name', flat=True)
-        )
-        teams = list(w.teams.values_list('name', flat=True))
+        projects = projects_by_worker.get(w.id, [])
+        teams = [t.name for t in w.teams.all()]      # prefetched — no query

-        certs = w.certificates.all()
-        certs_total = certs.count()
+        certs = list(w.certificates.all())           # prefetched — no query
+        certs_total = len(certs)
        certs_active = 0
        certs_expiring = 0
        certs_expired = 0
@ -1716,7 +1749,7 @@ def _build_worker_report_context(status=None, project_id=None, team_id=None):
            'worker': w,
            'projects': projects,
            'teams': teams,
-            'days_worked': w._days_worked or 0,
+            'days_worked': days_by_worker.get(w.id, 0),
            'first_payslip_date': w._first_payslip_date,
            'last_payslip_date': w._last_payslip_date,
            'total_paid_lifetime': w._total_paid_lifetime or Decimal('0.00'),
@ -1725,7 +1758,7 @@ def _build_worker_report_context(status=None, project_id=None, team_id=None):
            'certs_active': certs_active,
            'certs_expiring': certs_expiring,
            'certs_expired': certs_expired,
-            'warnings_count': w._active_warnings or 0,
+            'warnings_count': warnings_by_worker.get(w.id, 0),
        })
    return rows

@ -2559,20 +2592,24 @@ def _build_report_context(start_date, end_date, project_ids=None, team_ids=None)
        .values_list('workers__id', 'days')
    )

+    # Adjustment totals per (worker, type) — ONE GROUP BY query instead of
+    # a per-worker-per-type .aggregate() inside the loop below (which fired
+    # workers × types separate queries, ~112 on a busy month — audit fix,
+    # Jun 2026). Same numbers, computed once.
+    adj_by_worker_type = {
+        (row['worker_id'], row['type']): row['total'] or Decimal('0.00')
+        for row in adjustments.values('worker_id', 'type').annotate(total=Sum('amount'))
+    }
+
    worker_breakdown = []
    for wr in worker_records:
-        w_adjs = adjustments.filter(worker_id=wr['worker__id'])
        # Per-type amounts for this worker (only for types that exist in the period).
        # Each `adj_values` entry is {'amount': Decimal, 'is_deductive': bool}
        # so the template can render "-R 500.00" for deductive types.
-        adj_values = []
-        for adj_type in active_adj_types:
-            amt = w_adjs.filter(type=adj_type).aggregate(
-                t=Sum('amount'))['t'] or Decimal('0.00')
-            adj_values.append({
-                'amount': amt,
-                'is_deductive': adj_type in DEDUCTIVE_TYPES,
-            })
+        adj_values = [{
+            'amount': adj_by_worker_type.get((wr['worker__id'], adj_type), Decimal('0.00')),
+            'is_deductive': adj_type in DEDUCTIVE_TYPES,
+        } for adj_type in active_adj_types]

        worker_breakdown.append({
            'name': wr['worker__name'],