From bd3522d28ae9e4616cc78b8da10e2d1d23554c8f Mon Sep 17 00:00:00 2001
From: muena
Date: Wed, 20 May 2026 13:29:35 +0200
Subject: [PATCH] fix(velum): rebalance brackets after pseudonymization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Velum's entity detector occasionally pulls a leading ( into a PHONE span
without taking the matching ) — so "Berater (+43 660 938 4021)" turned
into "Berater )", with the opening paren swallowed by the placeholder.

After normalizing the response, walk each mapping entry and peel off any
unbalanced opening or closing bracket from the original value, moving it
outside the placeholder in the pseudonymized text. Works for () [] {}.
Balanced brackets inside the original (e.g. a "(660)" area code) are
left alone.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
Review notes (text between the three-dash line and the diff is commentary
and does not become part of the commit message):

* Layout: the line breaks and indentation of this file were reconstructed
  from a whitespace-collapsed copy. The result agrees with the hunk
  headers (60 + 4 added lines, 2 removed), but the 2-space indentation is
  an assumption - confirm it against src/velum.ts before applying.

* What the change does: rebalanceBrackets() visits every mapping entry.
  While the original value starts with an opener that occurs more often
  in the value than its closer, the first character is moved to `lead`;
  a mirror-image loop then moves trailing closers to `trail`. Every
  occurrence of the placeholder in the text is rewritten as
  lead + placeholder + trail, and the returned mapping stores the trimmed
  original. normalizePseudonymizeResponse() now returns the rebalanced
  text and mapping instead of `text` and `normalizedMapping`.

* NOTE(review): the peel test compares totals over the whole value, not
  nesting depth. For "(660) 938 (4021" it is the first, matched "(" that
  gets moved out, leaving "660) 938 (4021" in the mapping. The
  concatenation lead + original + trail still equals the input value.

* NOTE(review): the second argument of String.prototype.replace is a
  replacement pattern, so a "$&", "$1" or "$$" sequence inside a
  placeholder would be expanded. Whether placeholders can ever contain
  "$" is not visible in this patch - confirm, or pass a replacer function.

* NOTE(review): a value that consists only of stray brackets (e.g. "(")
  ends up with an empty `original` in the returned mapping.

 src/velum.ts | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/src/velum.ts b/src/velum.ts
index d8b8681..a2cd096 100644
--- a/src/velum.ts
+++ b/src/velum.ts
@@ -11,6 +11,66 @@ function escapeRegExp(value: string): string {
   return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 }
 
+const BRACKET_PAIRS: Array<{ open: string; close: string }> = [
+  { open: "(", close: ")" },
+  { open: "[", close: "]" },
+  { open: "{", close: "}" },
+];
+
+function countChar(value: string, char: string): number {
+  let count = 0;
+  for (const c of value) if (c === char) count += 1;
+  return count;
+}
+
+function rebalanceBrackets(
+  text: string,
+  mapping: PlaceholderMapping,
+): { text: string; mapping: PlaceholderMapping } {
+  let resultText = text;
+  const resultMapping: PlaceholderMapping = {};
+
+  for (const [placeholder, entry] of Object.entries(mapping)) {
+    let original = entry.original;
+    let lead = "";
+    let trail = "";
+
+    let trimmed = true;
+    while (trimmed && original.length > 0) {
+      trimmed = false;
+      const pair = BRACKET_PAIRS.find((p) => original.startsWith(p.open));
+      if (pair && countChar(original, pair.open) > countChar(original, pair.close)) {
+        lead += original[0];
+        original = original.slice(1);
+        trimmed = true;
+      }
+    }
+
+    trimmed = true;
+    while (trimmed && original.length > 0) {
+      trimmed = false;
+      const pair = BRACKET_PAIRS.find((p) => original.endsWith(p.close));
+      if (pair && countChar(original, pair.close) > countChar(original, pair.open)) {
+        trail = original[original.length - 1] + trail;
+        original = original.slice(0, -1);
+        trimmed = true;
+      }
+    }
+
+    if (lead || trail) {
+      const escaped = escapeRegExp(placeholder);
+      resultText = resultText.replace(
+        new RegExp(escaped, "g"),
+        `${lead}${placeholder}${trail}`,
+      );
+    }
+
+    resultMapping[placeholder] = { ...entry, original };
+  }
+
+  return { text: resultText, mapping: resultMapping };
+}
+
 export function normalizePseudonymizeResponse(
   response: PseudonymizeResponse,
 ): PseudonymizeResponse {
@@ -34,10 +94,12 @@ export function normalizePseudonymizeResponse(
     text = text.replace(bareRegex, bracketed);
   }
 
+  const rebalanced = rebalanceBrackets(text, normalizedMapping);
+
   return {
     ...response,
-    pseudonymized_text: text,
-    mapping: normalizedMapping,
+    pseudonymized_text: rebalanced.text,
+    mapping: rebalanced.mapping,
   };
 }
 