fix(velum): rebalance brackets after pseudonymization

Velum's entity detector occasionally pulls a leading ( into a PHONE
span without taking the matching ) — so "Berater (+43 660 938 4021)"
turned into "Berater <PHONE_2>)", with the opening paren swallowed by
the placeholder.

After normalizing the response, walk each mapping entry and peel off
any unbalanced opening or closing bracket from the original value,
moving it outside the placeholder in the pseudonymized text. Works for
() [] {}. Balanced brackets inside the original (e.g. a "(660)" area
code) are left alone.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
muena
2026-05-20 13:29:35 +02:00
parent 7f3f26534c
commit bd3522d28a

View File

@@ -11,6 +11,66 @@ function escapeRegExp(value: string): string {
return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
} }
/**
 * Bracket kinds handled by the rebalancing pass, as opener/closer pairs:
 * round, square and curly.
 */
const BRACKET_PAIRS: Array<{ open: string; close: string }> = (
  [
    ["(", ")"],
    ["[", "]"],
    ["{", "}"],
  ] as Array<[string, string]>
).map(([open, close]) => ({ open, close }));
/**
 * Counts how many code points of `value` are exactly equal to `char`.
 *
 * The comparison is made per code point, so an empty or multi-character
 * `char` never matches and the result is 0.
 *
 * @param value - String to scan.
 * @param char - Single character to look for.
 * @returns Number of occurrences of `char` in `value`.
 */
function countChar(value: string, char: string): number {
  return Array.from(value).reduce(
    (total, current) => (current === char ? total + 1 : total),
    0,
  );
}
/**
 * Returns true when the bracket at the start of `value` has no partner inside
 * `value`. Only brackets of the given kind are tracked.
 *
 * Expects `value` to start with `open`; the depth therefore starts at 1 and
 * the leading bracket is matched as soon as the depth drops back to 0.
 */
function isLeadingBracketUnmatched(value: string, open: string, close: string): boolean {
  let depth = 0;
  for (const c of value) {
    if (c === open) {
      depth += 1;
    } else if (c === close) {
      depth -= 1;
      if (depth === 0) return false;
    }
  }
  return depth > 0;
}

/**
 * Mirror image of `isLeadingBracketUnmatched`: scans from the end and returns
 * true when the closing bracket that ends `value` has no opener inside it.
 */
function isTrailingBracketUnmatched(value: string, open: string, close: string): boolean {
  let depth = 0;
  for (let i = value.length - 1; i >= 0; i -= 1) {
    const c = value.charAt(i);
    if (c === close) {
      depth += 1;
    } else if (c === open) {
      depth -= 1;
      if (depth === 0) return false;
    }
  }
  return depth > 0;
}

/** Returns the opening bracket at the start of `value` if it is unmatched, else undefined. */
function unmatchedLeadingBracket(value: string): string | undefined {
  const pair = BRACKET_PAIRS.find((p) => value.startsWith(p.open));
  if (!pair) return undefined;
  // The count comparison is a cheap necessary condition; the scan then confirms
  // that the *leading* bracket is the unmatched one, so "(660) 938 (" keeps its
  // balanced area code intact.
  const surplus = countChar(value, pair.open) > countChar(value, pair.close);
  return surplus && isLeadingBracketUnmatched(value, pair.open, pair.close)
    ? pair.open
    : undefined;
}

/** Returns the closing bracket at the end of `value` if it is unmatched, else undefined. */
function unmatchedTrailingBracket(value: string): string | undefined {
  const pair = BRACKET_PAIRS.find((p) => value.endsWith(p.close));
  if (!pair) return undefined;
  const surplus = countChar(value, pair.close) > countChar(value, pair.open);
  return surplus && isTrailingBracketUnmatched(value, pair.open, pair.close)
    ? pair.close
    : undefined;
}

/**
 * Moves unmatched edge brackets out of mapped originals and back into the text.
 *
 * For every mapping entry, unmatched opening brackets at the start of
 * `entry.original` and unmatched closing brackets at its end are stripped from
 * the original and re-emitted around every occurrence of the placeholder in
 * `text`. Brackets that are balanced within the original are left untouched.
 *
 * @param text - Pseudonymized text containing the placeholders.
 * @param mapping - Placeholder -> entry map; each entry carries the `original` value.
 * @returns The adjusted text and a new mapping object with trimmed originals;
 *          the `mapping` argument itself is not mutated.
 */
function rebalanceBrackets(
  text: string,
  mapping: PlaceholderMapping,
): { text: string; mapping: PlaceholderMapping } {
  let resultText = text;
  const resultMapping: PlaceholderMapping = {};

  for (const [placeholder, entry] of Object.entries(mapping)) {
    let original = entry.original;
    let lead = "";
    let trail = "";

    // Peel unmatched openers off the front, outermost first.
    let opener = unmatchedLeadingBracket(original);
    while (opener !== undefined) {
      lead += opener;
      original = original.slice(opener.length);
      opener = unmatchedLeadingBracket(original);
    }

    // Peel unmatched closers off the back, outermost first.
    let closer = unmatchedTrailingBracket(original);
    while (closer !== undefined) {
      trail = closer + trail;
      original = original.slice(0, original.length - closer.length);
      closer = unmatchedTrailingBracket(original);
    }

    if (lead || trail) {
      const wrapped = `${lead}${placeholder}${trail}`;
      // Function replacer: the replacement is inserted verbatim, so "$&", "$1"
      // etc. inside a placeholder are never interpreted as substitution patterns.
      resultText = resultText.replace(
        new RegExp(escapeRegExp(placeholder), "g"),
        () => wrapped,
      );
    }

    resultMapping[placeholder] = { ...entry, original };
  }

  return { text: resultText, mapping: resultMapping };
}
export function normalizePseudonymizeResponse( export function normalizePseudonymizeResponse(
response: PseudonymizeResponse, response: PseudonymizeResponse,
): PseudonymizeResponse { ): PseudonymizeResponse {
@@ -34,10 +94,12 @@ export function normalizePseudonymizeResponse(
text = text.replace(bareRegex, bracketed); text = text.replace(bareRegex, bracketed);
} }
const rebalanced = rebalanceBrackets(text, normalizedMapping);
return { return {
...response, ...response,
pseudonymized_text: text, pseudonymized_text: rebalanced.text,
mapping: normalizedMapping, mapping: rebalanced.mapping,
}; };
} }