From bf76b0f233d2f599ba04f3ce40214dac629e133a Mon Sep 17 00:00:00 2001 From: Cameron Dawson Date: Mon, 25 May 2026 10:30:02 -0700 Subject: [PATCH 1/2] Reduce over-fetching in Bugscache queries Bugscache.search() and the create_bug endpoint loaded full Bugscache rows when only a subset of columns is actually used: - search(): defer `modified` and `processed_update` via .only(), since serialize() excludes them anyway. Remove the dead `jobmap` entry from the serialize() exclude list (a reverse relation model_to_dict never iterates). - create_bug: resolve the existing bug id with values_list() instead of fetching the whole row, restrict the summary fallback query with .only(), and limit the linking save() to the two changed columns via update_fields. Add tests covering the create_bug internal-id resolution paths. --- tests/webapp/api/test_bugzilla.py | 86 +++++++++++++++++++++++++++++++ treeherder/model/models.py | 15 +++++- treeherder/webapp/api/bugzilla.py | 16 ++++-- 3 files changed, 111 insertions(+), 6 deletions(-) diff --git a/tests/webapp/api/test_bugzilla.py b/tests/webapp/api/test_bugzilla.py index 7f6d09f5508..00a1d39de6e 100644 --- a/tests/webapp/api/test_bugzilla.py +++ b/tests/webapp/api/test_bugzilla.py @@ -2,6 +2,9 @@ import responses from django.urls import reverse +from django.utils import timezone + +from treeherder.model.models import Bugscache def test_create_bug(client, eleven_jobs_stored, activate_responses, test_user): @@ -264,3 +267,86 @@ def request_callback(request): ) assert resp.status_code == 400 assert resp.json()["failure"] == "Crash signature can't be more than 2048 characters." + + +def _make_bug_response(bug_id=323): + def request_callback(request): + return (200, {}, json.dumps({"id": bug_id})) + + responses.add_callback( + responses.POST, + "https://thisisnotbugzilla.org/rest/bug", + callback=request_callback, + content_type="application/json", + ) + + +def test_create_bug_returns_existing_internal_id( + client, eleven_jobs_stored, activate_responses, test_user +): + """When a Bugscache row already exists for the new bugzilla id, its pk is returned.""" + _make_bug_response(bug_id=323) + existing = Bugscache.objects.create( + bugzilla_id=323, + status="NEW", + resolution="", + summary="some pre-existing summary", + crash_signature="", + keywords="", + modified=timezone.now(), + ) + + client.force_authenticate(user=test_user) + resp = client.post( + reverse("bugzilla-create-bug"), + { + "type": "defect", + "product": "Bugzilla", + "component": "Administration", + "summary": "Intermittent summary", + "version": "4.0.17", + "comment": "Intermittent Description", + "comment_tags": "treeherder", + "keywords": ["intermittent-failure"], + "is_security_issue": False, + }, + ) + assert resp.status_code == 200 + assert resp.json()["internal_id"] == existing.id + + +def test_create_bug_links_internal_bug_by_summary( + client, eleven_jobs_stored, activate_responses, test_user +): + """An internal bug matching by summary gets the new bugzilla id and its pk is returned.""" + _make_bug_response(bug_id=323) + existing = Bugscache.objects.create( + bugzilla_id=None, + status="NEW", + resolution="", + summary="Intermittent summary", + crash_signature="", + keywords="", + modified=timezone.now(), + ) + + client.force_authenticate(user=test_user) + resp = client.post( + reverse("bugzilla-create-bug"), + { + "type": "defect", + "product": "Bugzilla", + "component": "Administration", + "summary": "Intermittent summary", + "version": "4.0.17", + "comment": "Intermittent Description", + "comment_tags": "treeherder", + "keywords": ["intermittent-failure"], + "is_security_issue": False, + }, + ) + assert resp.status_code == 200 + assert resp.json()["internal_id"] == existing.id + + existing.refresh_from_db() + assert existing.bugzilla_id == 323 diff --git a/treeherder/model/models.py b/treeherder/model/models.py index a36d38201be..482bd0c2cf3 100644 --- a/treeherder/model/models.py +++ b/treeherder/model/models.py @@ -305,7 +305,7 @@ def __str__(self): return f"{self.id}" def serialize(self): - exclude_fields = ["modified", "processed_update", "jobmap"] + exclude_fields = ["modified", "processed_update"] attrs = model_to_dict(self, exclude=exclude_fields) # Serialize bug ID as the bugzilla number for compatibility reasons @@ -333,6 +333,19 @@ def search(cls, search_term): # Django already escapes special characters, so we do not need to handle that here recent_qs = ( Bugscache.objects.filter(summary__icontains=search_term) + # Only fetch the fields that serialize() returns; modified and + # processed_update are excluded there, so there's no need to load them. + .only( + "id", + "bugzilla_id", + "status", + "resolution", + "summary", + "dupe_of", + "crash_signature", + "keywords", + "whiteboard", + ) .annotate(similarity=TrigramSimilarity("summary", search_term)) .order_by("-similarity")[0:max_size] ) diff --git a/treeherder/webapp/api/bugzilla.py b/treeherder/webapp/api/bugzilla.py index d4e10015ef0..be09c688c40 100644 --- a/treeherder/webapp/api/bugzilla.py +++ b/treeherder/webapp/api/bugzilla.py @@ -105,15 +105,21 @@ def create_bug(self, request): bug_id = response.json()["id"] summary = summary.decode("utf-8") # Creation API only returns the ID, but the bug will be updated later on by `treeherder.etl.bugzilla.BzApiBugProcess` - bug = Bugscache.objects.filter(bugzilla_id=bug_id).first() - if not bug: - bugs = list(Bugscache.objects.filter(summary=summary).order_by("modified")) + internal_id = ( + Bugscache.objects.filter(bugzilla_id=bug_id).values_list("id", flat=True).first() + ) + if internal_id is None: + bugs = list( + Bugscache.objects.filter(summary=summary) + .only("id", "bugzilla_id", "modified") + .order_by("modified") + ) if bugs and not (bug := next((b.bugzilla_id == bug_id for b in bugs), None)): bug = bugs[-1] bug.modified = timezone.now() bug.bugzilla_id = bug_id - bug.save() - internal_id = bug.id if bug else None + bug.save(update_fields=["bugzilla_id", "modified"]) + internal_id = bug.id return Response( { "id": bug_id, From 79687d7b67f2bf1f434e62703a052bd25165bad7 Mon Sep 17 00:00:00 2001 From: Cameron Dawson Date: Mon, 25 May 2026 10:31:52 -0700 Subject: [PATCH 2/2] Add GIN trigram index on bugscache.summary Bugscache.search() filters with `summary ILIKE '%term%'` and orders by trigram similarity. The existing btree index on summary cannot serve substring or similarity matching, so every search does a sequential scan of the bugscache table. Add a GIN trigram index (gin_trgm_ops) on summary so those searches can use an index scan. Built CONCURRENTLY (non-atomic migration) to avoid a write lock on the table. The pg_trgm extension is already enabled (migration 0031). --- .../0051_bugscache_summary_gin_trgm_index.py | 26 +++++++++++++++++++ treeherder/model/models.py | 4 ++- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 treeherder/model/migrations/0051_bugscache_summary_gin_trgm_index.py diff --git a/treeherder/model/migrations/0051_bugscache_summary_gin_trgm_index.py b/treeherder/model/migrations/0051_bugscache_summary_gin_trgm_index.py new file mode 100644 index 00000000000..2d0f1df3885 --- /dev/null +++ b/treeherder/model/migrations/0051_bugscache_summary_gin_trgm_index.py @@ -0,0 +1,26 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + # Non-atomic so the index can be built CONCURRENTLY, avoiding a write lock + # on the bugscache table while the index is created. + atomic = False + + dependencies = [ + ("model", "0050_repository_accepts_pull_requests"), + ] + + operations = [ + # GIN trigram index backing Bugscache.search(), which filters with + # `summary ILIKE '%term%'` and orders by trigram similarity. The plain + # btree index on summary cannot serve substring/similarity matching, so + # this index is what lets those searches use an index scan instead of a + # sequential scan. Requires the pg_trgm extension (added in 0031). + migrations.RunSQL( + sql=( + "CREATE INDEX CONCURRENTLY IF NOT EXISTS bugscache_summary_gin_trgm_idx " + "ON bugscache USING gin (summary gin_trgm_ops);" + ), + reverse_sql="DROP INDEX IF EXISTS bugscache_summary_gin_trgm_idx;", + ), + ] diff --git a/treeherder/model/models.py b/treeherder/model/models.py index 482bd0c2cf3..c0af5b4c7fb 100644 --- a/treeherder/model/models.py +++ b/treeherder/model/models.py @@ -278,7 +278,9 @@ class Bugscache(models.Model): status = models.CharField(max_length=64, db_index=True) resolution = models.CharField(max_length=64, blank=True, db_index=True) - # Is covered by a FULLTEXT index created via a migrations RunSQL operation. + # Backed by a GIN trigram index (bugscache_summary_gin_trgm_idx, created via + # a RunSQL migration) so the ILIKE + trigram similarity search in search() + # can use an index scan. The btree Index below only covers exact lookups. summary = models.CharField(max_length=255) dupe_of = models.PositiveIntegerField(null=True) crash_signature = models.TextField(blank=True)