From 61cd297c181a37a0cc9e5f5d3e424b92bdf6aeaa Mon Sep 17 00:00:00 2001 From: Lina Tawfik Date: Wed, 28 May 2025 17:29:09 -0700 Subject: [PATCH] Add enhanced text sanitization --- rendered.html | 1 + src/create-prompt/index.ts | 6 +- src/github/data/formatter.ts | 33 +++- src/github/utils/sanitizer.ts | 64 ++++++ test-markdown.json | 6 + test/data-formatter.test.ts | 176 ++--------------- test/integration-sanitization.test.ts | 156 +++++++++++++++ test/sanitizer.test.ts | 274 ++++++++++++++++++++++++++ 8 files changed, 541 insertions(+), 175 deletions(-) create mode 100644 rendered.html create mode 100644 src/github/utils/sanitizer.ts create mode 100644 test-markdown.json create mode 100644 test/integration-sanitization.test.ts create mode 100644 test/sanitizer.test.ts diff --git a/rendered.html b/rendered.html new file mode 100644 index 0000000..93d8125 --- /dev/null +++ b/rendered.html @@ -0,0 +1 @@ +{"message":"Problems parsing JSON","documentation_url":"https://docs.github.com/rest/markdown/markdown#render-a-markdown-document","status":"400"} \ No newline at end of file diff --git a/src/create-prompt/index.ts b/src/create-prompt/index.ts index f622dec..885590d 100644 --- a/src/create-prompt/index.ts +++ b/src/create-prompt/index.ts @@ -9,8 +9,8 @@ import { formatComments, formatReviewComments, formatChangedFilesWithSHA, - stripHtmlComments, } from "../github/data/formatter"; +import { sanitizeContent } from "../github/utils/sanitizer"; import { isIssuesEvent, isIssueCommentEvent, @@ -419,14 +419,14 @@ ${ eventData.eventName === "pull_request_review") && eventData.commentBody ? ` -${stripHtmlComments(eventData.commentBody)} +${sanitizeContent(eventData.commentBody)} ` : "" } ${ context.directPrompt ? ` -${stripHtmlComments(context.directPrompt)} +${sanitizeContent(context.directPrompt)} ` : "" } diff --git a/src/github/data/formatter.ts b/src/github/data/formatter.ts index df7b485..3ecc579 100644 --- a/src/github/data/formatter.ts +++ b/src/github/data/formatter.ts @@ -6,10 +6,7 @@ import type { GitHubReview, } from "../types"; import type { GitHubFileWithSHA } from "./fetcher"; - -export function stripHtmlComments(text: string): string { - return text.replace(//g, ""); -} +import { sanitizeContent } from "../utils/sanitizer"; export function formatContext( contextData: GitHubPullRequest | GitHubIssue, @@ -37,13 +34,14 @@ export function formatBody( body: string, imageUrlMap: Map, ): string { - let processedBody = stripHtmlComments(body); + let processedBody = body; - // Replace image URLs with local paths for (const [originalUrl, localPath] of imageUrlMap) { processedBody = processedBody.replaceAll(originalUrl, localPath); } + processedBody = sanitizeContent(processedBody); + return processedBody; } @@ -53,15 +51,16 @@ export function formatComments( ): string { return comments .map((comment) => { - let body = stripHtmlComments(comment.body); + let body = comment.body; - // Replace image URLs with local paths if we have a mapping if (imageUrlMap && body) { for (const [originalUrl, localPath] of imageUrlMap) { body = body.replaceAll(originalUrl, localPath); } } + body = sanitizeContent(body); + return `[${comment.author.login} at ${comment.createdAt}]: ${body}`; }) .join("\n\n"); @@ -78,6 +77,19 @@ export function formatReviewComments( const formattedReviews = reviewData.nodes.map((review) => { let reviewOutput = `[Review by ${review.author.login} at ${review.submittedAt}]: ${review.state}`; + if (review.body && review.body.trim()) { + let body = review.body; + + if (imageUrlMap) { + for (const [originalUrl, localPath] of imageUrlMap) { + body = body.replaceAll(originalUrl, localPath); + } + } + + const sanitizedBody = sanitizeContent(body); + reviewOutput += `\n${sanitizedBody}`; + } + if ( review.comments && review.comments.nodes && @@ -85,15 +97,16 @@ export function formatReviewComments( ) { const comments = review.comments.nodes .map((comment) => { - let body = stripHtmlComments(comment.body); + let body = comment.body; - // Replace image URLs with local paths if we have a mapping if (imageUrlMap) { for (const [originalUrl, localPath] of imageUrlMap) { body = body.replaceAll(originalUrl, localPath); } } + body = sanitizeContent(body); + return ` [Comment on ${comment.path}:${comment.line || "?"}]: ${body}`; }) .join("\n"); diff --git a/src/github/utils/sanitizer.ts b/src/github/utils/sanitizer.ts new file mode 100644 index 0000000..db9979a --- /dev/null +++ b/src/github/utils/sanitizer.ts @@ -0,0 +1,64 @@ +export function stripInvisibleCharacters(content: string): string { + content = content.replace(/[\u200B\u200C\u200D\uFEFF]/g, ""); + content = content.replace( + /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g, + "", + ); + content = content.replace(/\u00AD/g, ""); + content = content.replace(/[\u202A-\u202E\u2066-\u2069]/g, ""); + return content; +} + +export function stripMarkdownImageAltText(content: string): string { + return content.replace(/!\[[^\]]*\]\(/g, "![]("); +} + +export function stripMarkdownLinkTitles(content: string): string { + content = content.replace(/(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g, "$1"); + content = content.replace(/(\[[^\]]*\]\([^)]+)\s+'[^']*'/g, "$1"); + return content; +} + +export function stripHiddenAttributes(content: string): string { + content = content.replace(/\salt\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\salt\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\stitle\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\stitle\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\saria-label\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\saria-label\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\sdata-[a-zA-Z0-9-]+\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\sdata-[a-zA-Z0-9-]+\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\splaceholder\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\splaceholder\s*=\s*[^\s>]+/gi, ""); + return content; +} + +export function normalizeHtmlEntities(content: string): string { + content = content.replace(/&#(\d+);/g, (_, dec) => { + const num = parseInt(dec, 10); + if (num >= 32 && num <= 126) { + return String.fromCharCode(num); + } + return ""; + }); + content = content.replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => { + const num = parseInt(hex, 16); + if (num >= 32 && num <= 126) { + return String.fromCharCode(num); + } + return ""; + }); + return content; +} + +export function sanitizeContent(content: string): string { + content = stripInvisibleCharacters(content); + content = stripMarkdownImageAltText(content); + content = stripMarkdownLinkTitles(content); + content = stripHiddenAttributes(content); + content = normalizeHtmlEntities(content); + return content; +} + +export const stripHtmlComments = (content: string) => + content.replace(//g, ""); diff --git a/test-markdown.json b/test-markdown.json new file mode 100644 index 0000000..030c003 --- /dev/null +++ b/test-markdown.json @@ -0,0 +1,6 @@ +{ + "text": "# Test Rendering\n\n\![](https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png)\n\nThe image above has no alt text but should still render.", + "mode": "gfm", + "context": "anthropics/claude-code-action" +} +EOF < /dev/null \ No newline at end of file diff --git a/test/data-formatter.test.ts b/test/data-formatter.test.ts index 1729732..3181032 100644 --- a/test/data-formatter.test.ts +++ b/test/data-formatter.test.ts @@ -6,7 +6,6 @@ import { formatReviewComments, formatChangedFiles, formatChangedFilesWithSHA, - stripHtmlComments, } from "../src/github/data/formatter"; import type { GitHubPullRequest, @@ -99,9 +98,9 @@ Some more text.`; const result = formatBody(body, imageUrlMap); expect(result) - .toBe(`Here is some text with an image: ![screenshot](/tmp/github-images/image-1234-0.png) + .toBe(`Here is some text with an image: ![](/tmp/github-images/image-1234-0.png) -And another one: ![another](/tmp/github-images/image-1234-1.jpg) +And another one: ![](/tmp/github-images/image-1234-1.jpg) Some more text.`); }); @@ -124,7 +123,7 @@ Some more text.`); ]); const result = formatBody(body, imageUrlMap); - expect(result).toBe("![image](https://example.com/image.png)"); + expect(result).toBe("![](https://example.com/image.png)"); }); test("handles multiple occurrences of same image", () => { @@ -139,8 +138,8 @@ Second: ![img](https://github.com/user-attachments/assets/test.png)`; ]); const result = formatBody(body, imageUrlMap); - expect(result).toBe(`First: ![img](/tmp/github-images/image-1234-0.png) -Second: ![img](/tmp/github-images/image-1234-0.png)`); + expect(result).toBe(`First: ![](/tmp/github-images/image-1234-0.png) +Second: ![](/tmp/github-images/image-1234-0.png)`); }); }); @@ -205,7 +204,7 @@ describe("formatComments", () => { const result = formatComments(comments, imageUrlMap); expect(result).toBe( - `[user1 at 2023-01-01T00:00:00Z]: Check out this screenshot: ![screenshot](/tmp/github-images/image-1234-0.png)\n\n[user2 at 2023-01-02T00:00:00Z]: Here's another image: ![bug](/tmp/github-images/image-1234-1.jpg)`, + `[user1 at 2023-01-01T00:00:00Z]: Check out this screenshot: ![](/tmp/github-images/image-1234-0.png)\n\n[user2 at 2023-01-02T00:00:00Z]: Here's another image: ![](/tmp/github-images/image-1234-1.jpg)`, ); }); @@ -233,7 +232,7 @@ describe("formatComments", () => { const result = formatComments(comments, imageUrlMap); expect(result).toBe( - `[user1 at 2023-01-01T00:00:00Z]: Two images: ![first](/tmp/github-images/image-1234-0.png) and ![second](/tmp/github-images/image-1234-1.png)`, + `[user1 at 2023-01-01T00:00:00Z]: Two images: ![](/tmp/github-images/image-1234-0.png) and ![](/tmp/github-images/image-1234-1.png)`, ); }); @@ -250,7 +249,7 @@ describe("formatComments", () => { const result = formatComments(comments); expect(result).toBe( - `[user1 at 2023-01-01T00:00:00Z]: Image: ![test](https://github.com/user-attachments/assets/test.png)`, + `[user1 at 2023-01-01T00:00:00Z]: Image: ![](https://github.com/user-attachments/assets/test.png)`, ); }); }); @@ -294,7 +293,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Nice implementation\n [Comment on src/utils.ts:?]: Consider adding error handling`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nThis is a great PR! LGTM.\n [Comment on src/index.ts:42]: Nice implementation\n [Comment on src/utils.ts:?]: Consider adding error handling`, ); }); @@ -317,7 +316,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nLooks good to me!`, ); }); @@ -384,7 +383,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: CHANGES_REQUESTED\n\n[Review by reviewer2 at 2023-01-02T00:00:00Z]: APPROVED`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: CHANGES_REQUESTED\nNeeds changes\n\n[Review by reviewer2 at 2023-01-02T00:00:00Z]: APPROVED\nLGTM`, ); }); @@ -438,7 +437,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData, imageUrlMap); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Comment with image: ![comment-img](/tmp/github-images/image-1234-1.png)`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nReview with image: ![](/tmp/github-images/image-1234-0.png)\n [Comment on src/index.ts:42]: Comment with image: ![](/tmp/github-images/image-1234-1.png)`, ); }); @@ -482,7 +481,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData, imageUrlMap); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/main.ts:15]: Two issues: ![issue1](/tmp/github-images/image-1234-0.png) and ![issue2](/tmp/github-images/image-1234-1.png)`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nGood work\n [Comment on src/main.ts:15]: Two issues: ![](/tmp/github-images/image-1234-0.png) and ![](/tmp/github-images/image-1234-1.png)`, ); }); @@ -515,7 +514,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Image: ![test](https://github.com/user-attachments/assets/test.png)`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nReview body\n [Comment on src/index.ts:42]: Image: ![](https://github.com/user-attachments/assets/test.png)`, ); }); }); @@ -579,150 +578,3 @@ describe("formatChangedFilesWithSHA", () => { expect(result).toBe(""); }); }); - -describe("stripHtmlComments", () => { - test("strips simple HTML comments", () => { - const text = "Hello world"; - expect(stripHtmlComments(text)).toBe("Hello world"); - }); - - test("strips multiple HTML comments", () => { - const text = "Start middle end"; - expect(stripHtmlComments(text)).toBe("Start middle end"); - }); - - test("strips multi-line HTML comments", () => { - const text = `Line 1 - -Line 2`; - expect(stripHtmlComments(text)).toBe(`Line 1 - -Line 2`); - }); - - test("strips nested comment-like content", () => { - const text = "Text still in comment --> after"; - // HTML doesn't support true nested comments - the first --> ends the comment - expect(stripHtmlComments(text)).toBe("Text still in comment --> after"); - }); - - test("handles empty string", () => { - expect(stripHtmlComments("")).toBe(""); - }); - - test("handles text without comments", () => { - const text = "No comments here!"; - expect(stripHtmlComments(text)).toBe("No comments here!"); - }); - - test("strips complex hidden content with XML tags", () => { - const text = `Normal request - -More normal text`; - expect(stripHtmlComments(text)).toBe(`Normal request - -More normal text`); - }); - - test("handles malformed comments - no closing", () => { - const text = "Text is not stripped - expect(stripHtmlComments(text)).toBe("Text comment"; - // Just --> without opening comment"); - }); - - test("preserves legitimate HTML-like content outside comments", () => { - const text = "Use the
tag and
closing tag"; - expect(stripHtmlComments(text)).toBe( - "Use the
tag and
closing tag", - ); - }); -}); - -describe("formatBody with HTML comment stripping", () => { - test("strips HTML comments from body", () => { - const body = "Issue description visible text"; - const imageUrlMap = new Map(); - - const result = formatBody(body, imageUrlMap); - expect(result).toBe("Issue description visible text"); - }); - - test("strips HTML comments and replaces images", () => { - const body = `Check this ![img](https://github.com/user-attachments/assets/test.png)`; - const imageUrlMap = new Map([ - [ - "https://github.com/user-attachments/assets/test.png", - "/tmp/github-images/image-1234-0.png", - ], - ]); - - const result = formatBody(body, imageUrlMap); - expect(result).toBe( - "Check this ![img](/tmp/github-images/image-1234-0.png)", - ); - }); -}); - -describe("formatComments with HTML comment stripping", () => { - test("strips HTML comments from comment bodies", () => { - const comments: GitHubComment[] = [ - { - id: "1", - databaseId: "100001", - body: "Good work on this PR", - author: { login: "user1" }, - createdAt: "2023-01-01T00:00:00Z", - }, - ]; - - const result = formatComments(comments); - expect(result).toBe( - "[user1 at 2023-01-01T00:00:00Z]: Good work on this PR", - ); - }); -}); - -describe("formatReviewComments with HTML comment stripping", () => { - test("strips HTML comments from review comment bodies", () => { - const reviewData = { - nodes: [ - { - id: "review1", - databaseId: "300001", - author: { login: "reviewer1" }, - body: "LGTM", - state: "APPROVED", - submittedAt: "2023-01-01T00:00:00Z", - comments: { - nodes: [ - { - id: "comment1", - databaseId: "200001", - body: "Nice work here", - author: { login: "reviewer1" }, - createdAt: "2023-01-01T00:00:00Z", - path: "src/index.ts", - line: 42, - }, - ], - }, - }, - ], - }; - - const result = formatReviewComments(reviewData); - expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Nice work here`, - ); - }); -}); diff --git a/test/integration-sanitization.test.ts b/test/integration-sanitization.test.ts new file mode 100644 index 0000000..32bf232 --- /dev/null +++ b/test/integration-sanitization.test.ts @@ -0,0 +1,156 @@ +import { describe, expect, it } from "bun:test"; +import { formatBody, formatComments } from "../src/github/data/formatter"; +import type { GitHubComment } from "../src/github/types"; + +describe("Integration: Text Sanitization", () => { + it("should sanitize text in issue body", () => { + const body = ` +# Title text + +Some content here. + +Here's an image: some alt text + +And a markdown image: ![image text](screenshot.png) + +Check this link: [Click here](https://example.com "link title") + +Text with hidden​‌‍characters + +
+ Content with attributes +
+ +Entity-encoded: HELLO + +Direction: ‮reversed‬ text + + + +Text­with­soft­hyphens + +More text: with‌zero‍width​characters`; + + const imageUrlMap = new Map(); + const result = formatBody(body, imageUrlMap); + + expect(result).not.toContain("some alt text"); + expect(result).not.toContain("image text"); + expect(result).not.toContain("link title"); + expect(result).not.toContain("test data"); + expect(result).not.toContain("label text"); + expect(result).not.toContain("title text"); + expect(result).not.toContain("placeholder text"); + expect(result).not.toContain('alt="'); + expect(result).not.toContain('title="'); + expect(result).not.toContain('aria-label="'); + expect(result).not.toContain('data-prompt="'); + expect(result).not.toContain('placeholder="'); + expect(result).not.toContain("\u200B"); + expect(result).not.toContain("\u200C"); + expect(result).not.toContain("\u200D"); + expect(result).not.toContain("\u00AD"); + expect(result).not.toContain("\u202E"); + expect(result).not.toContain("H"); + + expect(result).toContain("# Title text"); + expect(result).toContain("Some content here."); + expect(result).toContain("Here's an image:"); + expect(result).toContain(''); + expect(result).toContain("![](screenshot.png)"); + expect(result).toContain("[Click here](https://example.com)"); + expect(result).toContain("Content with attributes"); + expect(result).toContain("HELLO"); + expect(result).toContain(''); + }); + + it("should sanitize text in comments", () => { + const comments: GitHubComment[] = [ + { + id: "1", + databaseId: "100001", + body: `Comment text + +Check this: ![description text](image.png) +[Documentation](https://docs.com "doc title") + +Text​‌‍with characters + +Visible text`, + author: { login: "user1" }, + createdAt: "2023-01-01T00:00:00Z", + }, + ]; + + const result = formatComments(comments); + + expect(result).not.toContain("description text"); + expect(result).not.toContain("doc title"); + expect(result).not.toContain("span label"); + expect(result).not.toContain("data value"); + expect(result).not.toContain('aria-label="'); + expect(result).not.toContain('data-cmd="'); + expect(result).not.toContain("\u200B"); + expect(result).not.toContain("\u200C"); + expect(result).not.toContain("\u200D"); + + expect(result).toContain("Comment text"); + expect(result).toContain("![](image.png)"); + expect(result).toContain("[Documentation](https://docs.com)"); + expect(result).toContain("Visible text"); + expect(result).toContain("Textwith characters"); + }); + + it("should handle complex mixed patterns", () => { + const content = ` +Text content here. + +
+ img­alt­text + Text with ‮reversed‬ content +
+ +![alt text\u200Bwith\u200Ccharacters](image.png) + +[link](url.com "title\u00ADtext") + +Mix: Hidden text`; + + const imageUrlMap = new Map(); + const result = formatBody(content, imageUrlMap); + + expect(result).not.toContain('title="'); + expect(result).not.toContain('data-instruction="'); + expect(result).not.toContain('alt="'); + expect(result).not.toContain('aria-label="'); + expect(result).not.toContain("\u200B"); + expect(result).not.toContain("\u200C"); + expect(result).not.toContain("\u00AD"); + expect(result).not.toContain("\u202E"); + + expect(result).toContain("Text content here."); + expect(result).toContain("
"); + expect(result).toContain(''); + expect(result).toContain("![](image.png)"); + expect(result).toContain("[link](url.com)"); + expect(result).toContain("Hidden text"); + }); + + it("should handle edge cases with empty attributes", () => { + const edgeCases = ` + +
Content
+![](already-empty.png) +[link](url.com) +Normal text`; + + const imageUrlMap = new Map(); + const result = formatBody(edgeCases, imageUrlMap); + + expect(result).toContain(''); + expect(result).toContain("
Content
"); + expect(result).toContain("![](already-empty.png)"); + expect(result).toContain("[link](url.com)"); + expect(result).toContain("Normal text"); + }); +}); diff --git a/test/sanitizer.test.ts b/test/sanitizer.test.ts new file mode 100644 index 0000000..94e7bda --- /dev/null +++ b/test/sanitizer.test.ts @@ -0,0 +1,274 @@ +import { describe, expect, it } from "bun:test"; +import { + stripInvisibleCharacters, + stripMarkdownImageAltText, + stripMarkdownLinkTitles, + stripHiddenAttributes, + normalizeHtmlEntities, + sanitizeContent, + stripHtmlComments, +} from "../src/github/utils/sanitizer"; + +describe("stripInvisibleCharacters", () => { + it("should remove zero-width characters", () => { + expect(stripInvisibleCharacters("Hello\u200BWorld")).toBe("HelloWorld"); + expect(stripInvisibleCharacters("Text\u200C\u200D")).toBe("Text"); + expect(stripInvisibleCharacters("\uFEFFStart")).toBe("Start"); + }); + + it("should remove control characters", () => { + expect(stripInvisibleCharacters("Hello\u0000World")).toBe("HelloWorld"); + expect(stripInvisibleCharacters("Text\u001F\u007F")).toBe("Text"); + }); + + it("should preserve common whitespace", () => { + expect(stripInvisibleCharacters("Hello\nWorld")).toBe("Hello\nWorld"); + expect(stripInvisibleCharacters("Tab\there")).toBe("Tab\there"); + expect(stripInvisibleCharacters("Carriage\rReturn")).toBe( + "Carriage\rReturn", + ); + }); + + it("should remove soft hyphens", () => { + expect(stripInvisibleCharacters("Soft\u00ADHyphen")).toBe("SoftHyphen"); + }); + + it("should remove Unicode direction overrides", () => { + expect(stripInvisibleCharacters("Text\u202A\u202BMore")).toBe("TextMore"); + expect(stripInvisibleCharacters("\u2066Isolated\u2069")).toBe("Isolated"); + }); +}); + +describe("stripMarkdownImageAltText", () => { + it("should remove alt text from markdown images", () => { + expect(stripMarkdownImageAltText("![example alt text](image.png)")).toBe( + "![](image.png)", + ); + expect(stripMarkdownImageAltText("Text ![description](pic.jpg) more text")).toBe( + "Text ![](pic.jpg) more text", + ); + }); + + it("should handle multiple images", () => { + expect(stripMarkdownImageAltText("![one](1.png) ![two](2.png)")).toBe( + "![](1.png) ![](2.png)", + ); + }); + + it("should handle empty alt text", () => { + expect(stripMarkdownImageAltText("![](image.png)")).toBe("![](image.png)"); + }); +}); + +describe("stripMarkdownLinkTitles", () => { + it("should remove titles from markdown links", () => { + expect(stripMarkdownLinkTitles('[Link](url.com "example title")')).toBe( + "[Link](url.com)", + ); + expect(stripMarkdownLinkTitles("[Link](url.com 'example title')")).toBe( + "[Link](url.com)", + ); + }); + + it("should handle multiple links", () => { + expect( + stripMarkdownLinkTitles('[One](1.com "first") [Two](2.com "second")'), + ).toBe("[One](1.com) [Two](2.com)"); + }); + + it("should preserve links without titles", () => { + expect(stripMarkdownLinkTitles("[Link](url.com)")).toBe("[Link](url.com)"); + }); +}); + +describe("stripHiddenAttributes", () => { + it("should remove alt attributes", () => { + expect(stripHiddenAttributes('example text')).toBe( + '', + ); + expect(stripHiddenAttributes("example")).toBe( + '', + ); + expect(stripHiddenAttributes('example')).toBe( + '', + ); + }); + + it("should remove title attributes", () => { + expect( + stripHiddenAttributes('Link'), + ).toBe('Link'); + expect(stripHiddenAttributes("
Content
")).toBe( + "
Content
", + ); + }); + + it("should remove aria-label attributes", () => { + expect( + stripHiddenAttributes(''), + ).toBe(""); + }); + + it("should remove data-* attributes", () => { + expect( + stripHiddenAttributes( + '
Text
', + ), + ).toBe("
Text
"); + }); + + it("should remove placeholder attributes", () => { + expect( + stripHiddenAttributes(''), + ).toBe(''); + }); + + it("should handle multiple attributes", () => { + expect( + stripHiddenAttributes( + 'example', + ), + ).toBe(''); + }); +}); + +describe("normalizeHtmlEntities", () => { + it("should decode numeric entities", () => { + expect(normalizeHtmlEntities("Hello")).toBe( + "Hello", + ); + expect(normalizeHtmlEntities("ABC")).toBe("ABC"); + }); + + it("should decode hex entities", () => { + expect(normalizeHtmlEntities("Hello")).toBe( + "Hello", + ); + expect(normalizeHtmlEntities("ABC")).toBe("ABC"); + }); + + it("should remove non-printable entities", () => { + expect(normalizeHtmlEntities("�")).toBe(""); + expect(normalizeHtmlEntities("�")).toBe(""); + }); + + it("should preserve normal text", () => { + expect(normalizeHtmlEntities("Normal text")).toBe("Normal text"); + }); +}); + +describe("sanitizeContent", () => { + it("should apply all sanitization measures", () => { + const testContent = ` + + example alt text + ![example image description](screenshot.png) + [click here](https://example.com "example title") +
+ Normal text with hidden\u200Bcharacters +
+ Hidden message + `; + + const sanitized = sanitizeContent(testContent); + + expect(sanitized).not.toContain("example alt text"); + expect(sanitized).not.toContain("example image description"); + expect(sanitized).not.toContain("example title"); + expect(sanitized).not.toContain("example data"); + expect(sanitized).not.toContain("example label"); + expect(sanitized).not.toContain("\u200B"); + expect(sanitized).not.toContain("alt="); + expect(sanitized).not.toContain("data-prompt="); + expect(sanitized).not.toContain("aria-label="); + + expect(sanitized).toContain("Normal text with hiddencharacters"); + expect(sanitized).toContain("Hidden message"); + expect(sanitized).toContain(''); + expect(sanitized).toContain("![](screenshot.png)"); + expect(sanitized).toContain("[click here](https://example.com)"); + }); + + it("should handle complex nested patterns", () => { + const complexContent = ` + Text with ![alt \u200B text](image.png) and more. + Link +
Content
+ `; + + const sanitized = sanitizeContent(complexContent); + + expect(sanitized).not.toContain("\u200B"); + expect(sanitized).not.toContain("\u00AD"); + expect(sanitized).not.toContain("alt "); + expect(sanitized).not.toContain('title="'); + expect(sanitized).not.toContain('data-x="'); + expect(sanitized).toContain("![](image.png)"); + expect(sanitized).toContain('Link'); + }); + + it("should preserve legitimate markdown and HTML", () => { + const legitimateContent = ` + # Heading + + This is **bold** and *italic* text. + + Here's a normal image: ![](normal.jpg) + And a normal link: [Click here](https://example.com) + +
+

Normal paragraph

+ +
+ `; + + const sanitized = sanitizeContent(legitimateContent); + + expect(sanitized).toBe(legitimateContent); + }); + + it("should handle entity-encoded text", () => { + const encodedText = ` + Hidden message +
Test
+ `; + + const sanitized = sanitizeContent(encodedText); + + expect(sanitized).toContain("Hidden message"); + expect(sanitized).not.toContain('title="'); + expect(sanitized).toContain("
Test
"); + }); + + it("should handle mixed input patterns", () => { + const mixedInput = ` + ![example\u200Btext\u00ADwith\u00ADcharacters](image.png) + example + [link](url.com "title\u202Ewith\u202Ccharacters") + visible text + `; + + const sanitized = sanitizeContent(mixedInput); + + expect(sanitized).not.toContain("example"); + expect(sanitized).not.toContain("characters"); + expect(sanitized).not.toContain("title"); + expect(sanitized).not.toContain("data value"); + expect(sanitized).not.toContain("label text"); + expect(sanitized).toContain("visible text"); + }); +}); + +describe("stripHtmlComments (legacy)", () => { + it("should remove HTML comments", () => { + expect(stripHtmlComments("Hello World")).toBe("Hello World"); + expect(stripHtmlComments("Text")).toBe("Text"); + expect(stripHtmlComments("Text")).toBe("Text"); + }); + + it("should handle multiline comments", () => { + expect(stripHtmlComments("Hello World")).toBe( + "Hello World", + ); + }); +}); \ No newline at end of file