Fix: filtered payroll report inflates worker totals by N^2

Reported: when the generate-report page is filtered by BOTH project and
team, every amount in the "Worker Breakdown" and "Payments by Date"
tables blew up by ~100x. Example: Billy Baloyi R 5,400 (correct)
became R 604,800 (wrong, 112x) after selecting Wilkot + Civils One.

Root cause:
_build_report_context chained `records.filter(work_logs__project_id=X)
.distinct().filter(work_logs__team_id=Y).distinct()`. In Django's ORM
each chained M2M filter creates a SEPARATE JOIN alias on
core_payrollrecord_work_logs, so the SQL produces the cartesian product
of (matching-logs-for-project) x (matching-logs-for-team) rows per
PayrollRecord. A downstream `.values().annotate(Sum('amount_paid'))`
then summed across those duplicated rows - inflating every total by
N * M where N and M are the log counts per record.

Why total_paid_out looked correct: `.aggregate(Sum(...))` wraps the
query in a subquery when distinct() is in play, so it dedupes before
summing. `.values().annotate(Sum(...))` uses GROUP BY on the raw
joined rows and doesn't get that help.

Fix:
Replace chained M2M filters with id__in subquery filters:
  records.filter(id__in=PayrollRecord.objects.filter(
      work_logs__project_id=X).values('id'))
This keeps the outer queryset JOIN-free, so values().annotate(Sum())
aggregates over distinct records. Same pattern applied to the
adjustments team-filter (worker__teams M2M) for the adjustment
summary.

Tests: 5 new regression tests in ReportContextFilterInflationTests
covering project-only, team-only, both-filters, total_paid_out
invariant, and the adjustment summary path. All 24 tests pass
(19 existing + 5 new).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Konrad du Plessis 2026-04-22 19:51:07 +02:00
parent 6d37d1ba9b
commit f1e246ce24
2 changed files with 132 additions and 4 deletions

View File

@ -10,7 +10,7 @@ from django.test import TestCase
from django.urls import reverse
from core.models import Project, Team, Worker, WorkLog, PayrollRecord, PayrollAdjustment
from core.views import _build_work_log_payroll_context
from core.views import _build_work_log_payroll_context, _build_report_context
class WorkLogPayrollContextTests(TestCase):
@ -312,3 +312,107 @@ class WorkLogPayrollDetailTests(TestCase):
url = reverse('work_log_payroll_detail', args=[self.log.id])
resp = self.client.get(url)
self.assertEqual(resp.status_code, 403)
# =============================================================================
# === TESTS FOR PAYROLL REPORT FILTER INFLATION ===
# Regression tests for the M2M double-JOIN bug in _build_report_context.
#
# THE BUG (fixed 2026-04-23):
# Filtering a report by project AND team via chained `.filter(work_logs__field=X)`
# calls produced TWO separate JOIN aliases on the core_payrollrecord_work_logs
# M2M table. Any downstream `.values().annotate(Sum(...))` then aggregated
# across the cartesian product of matching rows, inflating every per-worker
# total_paid and every payments_by_date amount by N² (where N = number of
# matching work logs per record). Total_paid_out itself was correct because
# `.aggregate(Sum(...))` wraps distinct() in a subquery, but the per-row
# values/annotate pattern doesn't get that help.
#
# These tests lock down the fix: with filters applied, the worker-level and
# date-level totals must equal the real payment amount, not a multiplied one.
# =============================================================================
class ReportContextFilterInflationTests(TestCase):
"""Report aggregates must not inflate when project + team filters combine."""
def setUp(self):
self.admin = User.objects.create_user(username='admin-r', is_staff=True)
self.project = Project.objects.create(name='Solar Farm Gamma')
self.team = Team.objects.create(name='Team Gamma', supervisor=self.admin)
self.worker = Worker.objects.create(
name='Test Worker', id_number='TW1', monthly_salary=Decimal('4000')
)
# Worker must be in the team's M2M for the adjustment-summary test
# to find them via worker__teams__id=team_id. In real data this is
# the standard setup — workers belong to a team.
self.team.workers.add(self.worker)
# Three work logs in the range, all in the same project + team.
# This is the minimum setup that reproduces the N² inflation: with
# one payroll record linked to 3 logs, the double-JOIN produces 9 rows.
self.logs = []
for day in (5, 10, 15):
log = WorkLog.objects.create(
date=datetime.date(2026, 3, day),
project=self.project,
team=self.team,
supervisor=self.admin,
)
log.workers.add(self.worker)
self.logs.append(log)
# One payment covering all 3 logs.
self.record = PayrollRecord.objects.create(
worker=self.worker,
amount_paid=Decimal('600.00'),
date=datetime.date(2026, 3, 20),
)
self.record.work_logs.add(*self.logs)
def _ctx(self, project_id=None, team_id=None):
return _build_report_context(
datetime.date(2026, 3, 1),
datetime.date(2026, 3, 31),
project_id=project_id,
team_id=team_id,
)
def test_worker_breakdown_not_inflated_with_project_filter_only(self):
ctx = self._ctx(project_id=self.project.id)
self.assertEqual(len(ctx['worker_breakdown']), 1)
# Pre-fix: this was 600 × 3 = 1800 (one JOIN, 3-way inflation).
self.assertEqual(ctx['worker_breakdown'][0]['total_paid'], Decimal('600.00'))
def test_worker_breakdown_not_inflated_with_both_filters(self):
ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
self.assertEqual(len(ctx['worker_breakdown']), 1)
# Pre-fix: this was 600 × 3 × 3 = 5400 (two JOINs, 9-way inflation).
self.assertEqual(ctx['worker_breakdown'][0]['total_paid'], Decimal('600.00'))
def test_payments_by_date_not_inflated_with_both_filters(self):
ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
payments = list(ctx['payments_by_date'])
self.assertEqual(len(payments), 1)
self.assertEqual(payments[0]['total'], Decimal('600.00'))
def test_total_paid_out_stays_correct_with_both_filters(self):
"""Regression guard: total_paid_out was ALREADY correct pre-fix
because .aggregate() handles distinct() via a subquery. Lock it in
so a future refactor doesn't accidentally reintroduce inflation here."""
ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
self.assertEqual(ctx['total_paid_out'], Decimal('600.00'))
def test_adjustment_summary_not_inflated_with_team_filter(self):
"""Adjustments filtered by team go through worker__teams (M2M) — same
bug class. values().annotate(Sum()) would inflate if the worker is in
multiple teams or if the JOIN is chained with other M2M filters."""
PayrollAdjustment.objects.create(
worker=self.worker,
project=self.project,
type='Bonus',
amount=Decimal('100.00'),
date=datetime.date(2026, 3, 10),
description='Test bonus',
)
ctx = self._ctx(project_id=self.project.id, team_id=self.team.id)
totals = {item['type']: item['total'] for item in ctx['adjustment_totals']}
self.assertEqual(totals.get('Bonus'), Decimal('100.00'))

View File

@ -1878,11 +1878,30 @@ def _build_report_context(start_date, end_date, project_id=None, team_id=None):
date_filter = Q(date__gte=start_date, date__lte=end_date)
# --- PayrollRecords in range ---
#
# IMPORTANT — avoid M2M double-JOIN inflation:
# Chaining `.filter(work_logs__project_id=X).distinct().filter(work_logs__team_id=Y)`
# creates TWO separate JOIN aliases on core_payrollrecord_work_logs. Any
# later `.values().annotate(Sum())` then aggregates across the cartesian
# product of matching rows, inflating per-worker and per-date totals by
# N × M (where N and M are the counts of matching logs per record).
# `.aggregate(Sum())` is safe because Django wraps it in a subquery when
# distinct() is in play, but `.values().annotate(Sum())` isn't — so we
# use id__in subqueries to keep the outer queryset JOIN-free.
# See ReportContextFilterInflationTests for regression coverage.
records = PayrollRecord.objects.filter(date_filter)
if project_id:
records = records.filter(work_logs__project_id=project_id).distinct()
records = records.filter(
id__in=PayrollRecord.objects.filter(
work_logs__project_id=project_id
).values('id')
)
if team_id:
records = records.filter(work_logs__team_id=team_id).distinct()
records = records.filter(
id__in=PayrollRecord.objects.filter(
work_logs__team_id=team_id
).values('id')
)
# --- Total Paid Out (sum of all payments made) ---
total_paid_out = records.aggregate(total=Sum('amount_paid'))['total'] or Decimal('0.00')
@ -1895,11 +1914,16 @@ def _build_report_context(start_date, end_date, project_id=None, team_id=None):
)
# --- Adjustments in range ---
# project_id filters via an FK column (no JOIN inflation risk), but
# team_id goes through worker__teams M2M — apply the same subquery
# pattern as above to keep adj_by_type's values().annotate(Sum()) safe.
adjustments = PayrollAdjustment.objects.filter(date_filter)
if project_id:
adjustments = adjustments.filter(project_id=project_id)
if team_id:
adjustments = adjustments.filter(worker__teams__id=team_id).distinct()
adjustments = adjustments.filter(
worker__in=Worker.objects.filter(teams__id=team_id).values('id')
)
# --- Work Logs in range (for calculating actual labour cost) ---
work_logs_qs = WorkLog.objects.filter(date__gte=start_date, date__lte=end_date)