fix(velum): rebalance brackets after pseudonymization
Velum's entity detector occasionally pulls a leading ( into a PHONE
span without taking the matching ) — so "Berater (+43 660 938 4021)"
turned into "Berater <PHONE_2>)", with the opening paren swallowed by
the placeholder.
After normalizing the response, walk each mapping entry and peel off
any unbalanced leading opening bracket or trailing closing bracket from
the original value, moving it outside the placeholder in the
pseudonymized text. Works for () [] {}. Balanced brackets inside the
original (e.g. a "(660)" area code) are left alone.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
66
src/velum.ts
66
src/velum.ts
@@ -11,6 +11,66 @@ function escapeRegExp(value: string): string {
|
||||
return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
/**
 * Bracket kinds recognized when rebalancing placeholder values: one entry per
 * kind, pairing the opening character with its closing character.
 *
 * Declared read-only because the table is shared module state and is only
 * ever looked up, never modified.
 */
const BRACKET_PAIRS: ReadonlyArray<{ readonly open: string; readonly close: string }> = [
  { open: "(", close: ")" },
  { open: "[", close: "]" },
  { open: "{", close: "}" },
];
|
||||
|
||||
/**
 * Counts how many times `char` occurs in `value`.
 *
 * The string is walked code point by code point and each one is compared to
 * `char` with strict equality, so `char` must be exactly one code point to
 * ever match; an empty or multi-character `char` yields 0.
 *
 * @param value - The string to scan.
 * @param char - The single character to count.
 * @returns The number of occurrences of `char` in `value`.
 */
function countChar(value: string, char: string): number {
  let count = 0;
  for (const current of value) {
    if (current === char) count += 1;
  }
  return count;
}
|
||||
|
||||
/**
 * Returns the index of every bracket in `value` that has no partner.
 *
 * Each bracket kind in BRACKET_PAIRS is matched independently of the others:
 * a closer pairs with the most recent still-unpaired opener of the same kind.
 * Closers with no opener before them and openers left over at the end are
 * reported as unmatched.
 */
function findUnmatchedBrackets(value: string): Set<number> {
  const unmatched = new Set<number>();
  for (const { open, close } of BRACKET_PAIRS) {
    const pendingOpens: number[] = [];
    for (let i = 0; i < value.length; i += 1) {
      const ch = value.charAt(i);
      if (ch === open) {
        pendingOpens.push(i);
      } else if (ch === close) {
        if (pendingOpens.length > 0) pendingOpens.pop();
        else unmatched.add(i);
      }
    }
    for (const i of pendingOpens) unmatched.add(i);
  }
  return unmatched;
}

/**
 * Moves unbalanced edge brackets out of placeholder values and into the text.
 *
 * For every mapping entry, any run of unmatched opening brackets at the very
 * start of `entry.original` and any run of unmatched closing brackets at the
 * very end are stripped from the value and re-inserted around every occurrence
 * of the placeholder in `text`. Brackets that have a partner inside the value
 * (for example the parentheses in "(660) 938 4021") are never touched, even
 * when another bracket elsewhere in the value is unbalanced.
 *
 * Concatenating the moved prefix, the new `original`, and the moved suffix
 * always reproduces the old `original`.
 *
 * @param text - Pseudonymized text containing the placeholders.
 * @param mapping - Placeholder-to-entry mapping; it is not modified.
 * @returns The adjusted text and a new mapping whose entries are shallow
 *   copies of the input entries with `original` trimmed where needed.
 */
function rebalanceBrackets(
  text: string,
  mapping: PlaceholderMapping,
): { text: string; mapping: PlaceholderMapping } {
  const isOpener = (ch: string): boolean => BRACKET_PAIRS.some((p) => p.open === ch);
  const isCloser = (ch: string): boolean => BRACKET_PAIRS.some((p) => p.close === ch);

  let resultText = text;
  const resultMapping: PlaceholderMapping = {};

  for (const [placeholder, entry] of Object.entries(mapping)) {
    const full = entry.original;
    const unmatched = findUnmatchedBrackets(full);

    // Comparing opener/closer counts is not enough: "(660) 938 (" has a
    // surplus "(" but its leading "(" is balanced. Only peel brackets that
    // really have no partner.
    let start = 0;
    while (start < full.length && unmatched.has(start) && isOpener(full.charAt(start))) {
      start += 1;
    }
    let end = full.length;
    while (end > start && unmatched.has(end - 1) && isCloser(full.charAt(end - 1))) {
      end -= 1;
    }

    const lead = full.slice(0, start);
    const trail = full.slice(end);

    // Nothing to move, or an empty placeholder that cannot be located in the
    // text: keep the entry as it is so no bracket is lost.
    if (placeholder === "" || (lead === "" && trail === "")) {
      resultMapping[placeholder] = { ...entry };
      continue;
    }

    // split/join performs a literal replace-all; a replacement *string* given
    // to String.prototype.replace would expand "$&" or "$$" if the
    // placeholder happened to contain them.
    resultText = resultText.split(placeholder).join(`${lead}${placeholder}${trail}`);
    resultMapping[placeholder] = { ...entry, original: full.slice(start, end) };
  }

  return { text: resultText, mapping: resultMapping };
}
|
||||
|
||||
export function normalizePseudonymizeResponse(
|
||||
response: PseudonymizeResponse,
|
||||
): PseudonymizeResponse {
|
||||
@@ -34,10 +94,12 @@ export function normalizePseudonymizeResponse(
|
||||
text = text.replace(bareRegex, bracketed);
|
||||
}
|
||||
|
||||
const rebalanced = rebalanceBrackets(text, normalizedMapping);
|
||||
|
||||
return {
|
||||
...response,
|
||||
pseudonymized_text: text,
|
||||
mapping: normalizedMapping,
|
||||
pseudonymized_text: rebalanced.text,
|
||||
mapping: rebalanced.mapping,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user