Fix: filtered payroll report inflates worker totals by N^2

Reported: when the generate-report page is filtered by BOTH project and team, every amount in the "Worker Breakdown" and "Payments by Date" tables blew up by ~100x. Example: Billy Baloyi R 5,400 (correct) became R 604,800 (wrong, 112x) after selecting Wilkot + Civils One. Root cause: _build_report_context chained `records.filter(work_logs__project_id=X).distinct().filter(work_logs__team_id=Y).distinct()`. In Django's ORM each chained M2M filter creates a SEPARATE JOIN alias on core_payrollrecord_work_logs, so the SQL produces the cartesian product of (matching-logs-for-project) x (matching-logs-for-team) rows per PayrollRecord. A downstream `.values().annotate(Sum('amount_paid'))` then summed across those duplicated rows - inflating every total by N * M, where N and M are the counts of logs per record matching the project and team filters respectively (N^2 when the same logs match both filters). Why total_paid_out looked correct: `.aggregate(Sum(...))` wraps the query in a subquery when distinct() is in play, so it dedupes before summing. `.values().annotate(Sum(...))` uses GROUP BY on the raw joined rows and doesn't get that help. Fix: Replace chained M2M filters with id__in subquery filters: `records.filter(id__in=PayrollRecord.objects.filter(work_logs__project_id=X).values('id'))`. This keeps the outer queryset JOIN-free, so values().annotate(Sum()) aggregates over distinct records. Same pattern applied to the adjustments team-filter (worker__teams M2M) for the adjustment summary. Tests: 5 new regression tests in ReportContextFilterInflationTests covering the worker breakdown with a project-only filter, the worker breakdown with both filters, payments-by-date with both filters, the total_paid_out invariant, and the adjustment summary path. All 24 tests pass (19 existing + 5 new). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 19:51:07 +02:00 · 2026-04-22 19:51:07 +02:00 · f1e246ce24
commit f1e246ce24
parent 6d37d1ba9b
2 changed files with 132 additions and 4 deletions
--- a/core/tests.py
+++ b/core/tests.py
@ -10,7 +10,7 @@ from django.test import TestCase
 from django.urls import reverse
 from core.models import Project, Team, Worker, WorkLog, PayrollRecord, PayrollAdjustment
-from core.views import _build_work_log_payroll_context
+from core.views import _build_work_log_payroll_context, _build_report_context
 class WorkLogPayrollContextTests(TestCase):
@ -312,3 +312,107 @@ class WorkLogPayrollDetailTests(TestCase):
        url = reverse('work_log_payroll_detail', args=[self.log.id])
        resp = self.client.get(url)
        self.assertEqual(resp.status_code, 403)
 # =============================================================================
 # === TESTS FOR PAYROLL REPORT FILTER INFLATION ===
 # Regression tests for the M2M double-JOIN bug in _build_report_context.
 #
 # THE BUG (fixed 2026-04-22):
 # Filtering a report by project AND team via chained `.filter(work_logs__field=X)`
 # calls produced TWO separate JOIN aliases on the core_payrollrecord_work_logs
 # M2M table. Any downstream `.values().annotate(Sum(...))` then aggregated
 # across the cartesian product of matching rows, inflating every per-worker
 # total_paid and every payments_by_date amount by N × M (where N and M are the
 # numbers of work logs per record matching the project and the team filter;
 # this is N² when the same logs match both, as in the fixture below).
 # total_paid_out itself was correct because
 # `.aggregate(Sum(...))` wraps distinct() in a subquery, but the per-row
 # values/annotate pattern doesn't get that help.
 #
 # These tests lock down the fix: with filters applied, the worker-level and
 # date-level totals must equal the real payment amount, not a multiplied one.
 # =============================================================================
 class ReportContextFilterInflationTests(TestCase):
    """Report aggregates must not inflate when project + team filters combine.

    Fixture: one worker, one project, one team, three work logs in March 2026
    and a single PayrollRecord of 600.00 linked to all three logs. Every test
    calls _build_report_context for March 2026 and checks that the reported
    amount equals the real payment, not a multiple of it.
    """
    def setUp(self):
        """Create the single-record / three-log fixture shared by all tests."""
        self.admin = User.objects.create_user(username='admin-r', is_staff=True)
        self.project = Project.objects.create(name='Solar Farm Gamma')
        self.team = Team.objects.create(name='Team Gamma', supervisor=self.admin)
        self.worker = Worker.objects.create(
            name='Test Worker', id_number='TW1', monthly_salary=Decimal('4000')
        )
        # Worker must be in the team's M2M for the adjustment-summary test
        # to find them via worker__teams__id=team_id. In real data this is
        # the standard setup — workers belong to a team.
        self.team.workers.add(self.worker)
        # Three work logs in the range, all in the same project + team.
        # This is the minimum setup that reproduces the N² inflation: with
        # one payroll record linked to 3 logs, the double-JOIN produces 9 rows.
        self.logs = []
        for day in (5, 10, 15):
            log = WorkLog.objects.create(
                date=datetime.date(2026, 3, day),
                project=self.project,
                team=self.team,
                supervisor=self.admin,
            )
            log.workers.add(self.worker)
            self.logs.append(log)
        # One payment covering all 3 logs.
        self.record = PayrollRecord.objects.create(
            worker=self.worker,
            amount_paid=Decimal('600.00'),
            date=datetime.date(2026, 3, 20),
        )
        self.record.work_logs.add(*self.logs)
    def _ctx(self, project_id=None, team_id=None):
        """Build the report context for 2026-03-01..2026-03-31 with optional filters."""
        return _build_report_context(
            datetime.date(2026, 3, 1),
            datetime.date(2026, 3, 31),
            project_id=project_id,
            team_id=team_id,
        )
    def test_worker_breakdown_not_inflated_with_project_filter_only(self):
        """A project-only filter must report the worker's 600.00 exactly once."""
        ctx = self._ctx(project_id=self.project.id)
        self.assertEqual(len(ctx['worker_breakdown']), 1)
        # Pre-fix: this was 600 × 3 = 1800 (one JOIN, 3-way inflation).
        self.assertEqual(ctx['worker_breakdown'][0]['total_paid'], Decimal('600.00'))
    def test_worker_breakdown_not_inflated_with_both_filters(self):
        """Project + team filters together must still report 600.00 for the worker."""
        ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
        self.assertEqual(len(ctx['worker_breakdown']), 1)
        # Pre-fix: this was 600 × 3 × 3 = 5400 (two JOINs, 9-way inflation).
        self.assertEqual(ctx['worker_breakdown'][0]['total_paid'], Decimal('600.00'))
    def test_payments_by_date_not_inflated_with_both_filters(self):
        """Project + team filters must yield one payment-date row totalling 600.00."""
        ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
        # list() so len() and indexing work whether this is a queryset or a list.
        payments = list(ctx['payments_by_date'])
        self.assertEqual(len(payments), 1)
        self.assertEqual(payments[0]['total'], Decimal('600.00'))
    def test_total_paid_out_stays_correct_with_both_filters(self):
        """Regression guard: total_paid_out was ALREADY correct pre-fix
        because .aggregate() handles distinct() via a subquery. Lock it in
        so a future refactor doesn't accidentally reintroduce inflation here."""
        ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
        self.assertEqual(ctx['total_paid_out'], Decimal('600.00'))
    def test_adjustment_summary_not_inflated_with_team_filter(self):
        """Adjustments filtered by team go through worker__teams (M2M) — same
        bug class. values().annotate(Sum()) would inflate if the worker is in
        multiple teams or if the JOIN is chained with other M2M filters.

        NOTE(review): a single team_id filter presumably matches at most one
        M2M row per worker, so the "multiple teams" case may not inflate on
        its own — confirm against the Worker/Team model definition.
        """
        PayrollAdjustment.objects.create(
            worker=self.worker,
            project=self.project,
            type='Bonus',
            amount=Decimal('100.00'),
            date=datetime.date(2026, 3, 10),
            description='Test bonus',
        )
        ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
        # Index the per-type summary rows by adjustment type for a direct lookup.
        totals = {item['type']: item['total'] for item in ctx['adjustment_totals']}
        self.assertEqual(totals.get('Bonus'), Decimal('100.00'))
--- a/core/views.py
+++ b/core/views.py
@ -1878,11 +1878,30 @@ def _build_report_context(start_date, end_date, project_id=None, team_id=None):
    date_filter = Q(date__gte=start_date, date__lte=end_date)
    # --- PayrollRecords in range ---
    #
    # IMPORTANT — avoid M2M double-JOIN inflation:
    # Chaining `.filter(work_logs__project_id=X).distinct().filter(work_logs__team_id=Y)`
    # creates TWO separate JOIN aliases on core_payrollrecord_work_logs. Any
    # later `.values().annotate(Sum())` then aggregates across the cartesian
    # product of matching rows, inflating per-worker and per-date totals by
    # N × M (where N and M are the counts of matching logs per record).
    # `.aggregate(Sum())` is safe because Django wraps it in a subquery when
    # distinct() is in play, but `.values().annotate(Sum())` isn't — so we
    # use id__in subqueries to keep the outer queryset JOIN-free.
    # See ReportContextFilterInflationTests for regression coverage.
    records = PayrollRecord.objects.filter(date_filter)
    if project_id:
-        records = records.filter(work_logs__project_id=project_id).distinct()
+        records = records.filter(
            id__in=PayrollRecord.objects.filter(
                work_logs__project_id=project_id
            ).values('id')
        )
    if team_id:
-        records = records.filter(work_logs__team_id=team_id).distinct()
+        records = records.filter(
            id__in=PayrollRecord.objects.filter(
                work_logs__team_id=team_id
            ).values('id')
        )
    # --- Total Paid Out (sum of all payments made) ---
    total_paid_out = records.aggregate(total=Sum('amount_paid'))['total'] or Decimal('0.00')
@ -1895,11 +1914,16 @@ def _build_report_context(start_date, end_date, project_id=None, team_id=None):
    )
    # --- Adjustments in range ---
    # project_id filters via an FK column (no JOIN inflation risk), but
    # team_id goes through worker__teams M2M — apply the same subquery
    # pattern as above to keep adj_by_type's values().annotate(Sum()) safe.
    adjustments = PayrollAdjustment.objects.filter(date_filter)
    if project_id:
        adjustments = adjustments.filter(project_id=project_id)
    if team_id:
-        adjustments = adjustments.filter(worker__teams__id=team_id).distinct()
+        adjustments = adjustments.filter(
            worker__in=Worker.objects.filter(teams__id=team_id).values('id')
        )
    # --- Work Logs in range (for calculating actual labour cost) ---
    work_logs_qs = WorkLog.objects.filter(date__gte=start_date, date__lte=end_date)