Add enhanced text sanitization

2026-01-23 15:04:13 +08:00 · 2025-05-28 17:29:09 -07:00
parent 176dbc369d
commit 61cd297c18
8 changed files with 541 additions and 175 deletions
--- a/rendered.html
+++ b/rendered.html
@@ -0,0 +1 @@
+{"message":"Problems parsing JSON","documentation_url":"https://docs.github.com/rest/markdown/markdown#render-a-markdown-document","status":"400"}
--- a/src/create-prompt/index.ts
+++ b/src/create-prompt/index.ts
@@ -9,8 +9,8 @@ import {
  formatComments,
  formatReviewComments,
  formatChangedFilesWithSHA,
-  stripHtmlComments,
 } from "../github/data/formatter";
+import { sanitizeContent } from "../github/utils/sanitizer";
 import {
  isIssuesEvent,
  isIssueCommentEvent,
@@ -419,14 +419,14 @@ ${
    eventData.eventName === "pull_request_review") &&
  eventData.commentBody
    ? `<trigger_comment>
-${stripHtmlComments(eventData.commentBody)}
+${sanitizeContent(eventData.commentBody)}
 </trigger_comment>`
    : ""
 }
 ${
  context.directPrompt
    ? `<direct_prompt>
-${stripHtmlComments(context.directPrompt)}
+${sanitizeContent(context.directPrompt)}
 </direct_prompt>`
    : ""
 }
--- a/src/github/data/formatter.ts
+++ b/src/github/data/formatter.ts
@@ -6,10 +6,7 @@ import type {
  GitHubReview,
 } from "../types";
 import type { GitHubFileWithSHA } from "./fetcher";
-
-export function stripHtmlComments(text: string): string {
-  return text.replace(/<!--[\s\S]*?-->/g, "");
-}
+import { sanitizeContent } from "../utils/sanitizer";

 export function formatContext(
  contextData: GitHubPullRequest | GitHubIssue,
@@ -37,13 +34,14 @@ export function formatBody(
  body: string,
  imageUrlMap: Map<string, string>,
 ): string {
-  let processedBody = stripHtmlComments(body);
+  let processedBody = body;

-  // Replace image URLs with local paths
  for (const [originalUrl, localPath] of imageUrlMap) {
    processedBody = processedBody.replaceAll(originalUrl, localPath);
  }

+  processedBody = sanitizeContent(processedBody);
+
  return processedBody;
 }

@@ -53,15 +51,16 @@ export function formatComments(
 ): string {
  return comments
    .map((comment) => {
-      let body = stripHtmlComments(comment.body);
+      let body = comment.body;

-      // Replace image URLs with local paths if we have a mapping
      if (imageUrlMap && body) {
        for (const [originalUrl, localPath] of imageUrlMap) {
          body = body.replaceAll(originalUrl, localPath);
        }
      }

+      body = sanitizeContent(body);
+
      return `[${comment.author.login} at ${comment.createdAt}]: ${body}`;
    })
    .join("\n\n");
@@ -78,6 +77,19 @@ export function formatReviewComments(
  const formattedReviews = reviewData.nodes.map((review) => {
    let reviewOutput = `[Review by ${review.author.login} at ${review.submittedAt}]: ${review.state}`;

+    if (review.body && review.body.trim()) {
+      let body = review.body;
+
+      if (imageUrlMap) {
+        for (const [originalUrl, localPath] of imageUrlMap) {
+          body = body.replaceAll(originalUrl, localPath);
+        }
+      }
+
+      const sanitizedBody = sanitizeContent(body);
+      reviewOutput += `\n${sanitizedBody}`;
+    }
+
    if (
      review.comments &&
      review.comments.nodes &&
@@ -85,15 +97,16 @@ export function formatReviewComments(
    ) {
      const comments = review.comments.nodes
        .map((comment) => {
-          let body = stripHtmlComments(comment.body);
+          let body = comment.body;

-          // Replace image URLs with local paths if we have a mapping
          if (imageUrlMap) {
            for (const [originalUrl, localPath] of imageUrlMap) {
              body = body.replaceAll(originalUrl, localPath);
            }
          }

+          body = sanitizeContent(body);
+
          return `  [Comment on ${comment.path}:${comment.line || "?"}]: ${body}`;
        })
        .join("\n");
--- a/src/github/utils/sanitizer.ts
+++ b/src/github/utils/sanitizer.ts
@@ -0,0 +1,64 @@
+/** Deletes characters that occupy no visible space: zero-width marks, control
+ *  codes (tab, line feed, carriage return are kept), soft hyphens, and the
+ *  bidirectional embedding/override/isolate marks. */
+export function stripInvisibleCharacters(content: string): string {
+  return content
+    .replace(/[\u200B\u200C\u200D\uFEFF]/g, "")
+    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g, "")
+    .replace(/\u00AD/g, "")
+    .replace(/[\u202A-\u202E\u2066-\u2069]/g, "");
+}
+
+/** Empties markdown image alt text, `![alt](src)` -> `![](src)`, including alt text that holds one level of `[...]`. */
+export function stripMarkdownImageAltText(content: string): string {
+  return content.replace(/!\[(?:[^\[\]]|\[[^\[\]]*\])*\]\(|!\[[^\]]*\]\(/g, "![](");
+}
+/** Drops the quoted title that may follow the URL in `[text](url "title")` or `[text](url 'title')`. */
+export function stripMarkdownLinkTitles(content: string): string {
+  const untitled = content.replace(/(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g, "$1");
+  return untitled.replace(/(\[[^\]]*\]\([^)]+)\s+'[^']*'/g, "$1");
+}
+
+/**
+ * Removes attributes whose values are hidden in rendered HTML: alt, title,
+ * aria-label, data-* and placeholder. A quoted value ends at the quote that
+ * opened it, so alt="don't ..." is removed whole instead of leaving a tail.
+ */
+export function stripHiddenAttributes(content: string): string {
+  for (const name of ["alt", "title", "aria-label", "data-[a-zA-Z0-9-]+", "placeholder"]) {
+    // Per attribute: quoted values first, then bare (unquoted) values.
+    content = content.replace(new RegExp(`\\s${name}\\s*=\\s*(?:"[^"]*"|'[^']*')`, "gi"), "");
+    content = content.replace(new RegExp(`\\s${name}\\s*=\\s*[^\\s>]+`, "gi"), "");
+  }
+  return content;
+}
+
+/**
+ * Decodes numeric character references, decimal (`&#72;`) and then hexadecimal
+ * (`&#x48;`). Code points 32-126 become the character itself; any other
+ * numeric reference is deleted. Named entities such as `&amp;` are not matched.
+ */
+export function normalizeHtmlEntities(content: string): string {
+  // One decoder for both notations; only the radix differs.
+  const decode = (digits: string, radix: number): string => {
+    const code = parseInt(digits, radix);
+    const printable = code >= 32 && code <= 126;
+    return printable ? String.fromCharCode(code) : "";
+  };
+
+  return content
+    .replace(/&#(\d+);/g, (_whole, dec: string) => decode(dec, 10))
+    .replace(/&#x([0-9a-fA-F]+);/g, (_whole, hex: string) => decode(hex, 16));
+}
+
+/** Runs every sanitizer over untrusted text. HTML comments go first: they are invisible once rendered. */
+export function sanitizeContent(content: string): string {
+  content = stripHtmlComments(content);
+  content = stripInvisibleCharacters(content);
+  content = stripMarkdownImageAltText(content);
+  content = stripMarkdownLinkTitles(content);
+  content = stripHiddenAttributes(content);
+  return normalizeHtmlEntities(content);
+}
+export const stripHtmlComments = (content: string) =>
+  content.replace(/<!--[\s\S]*?-->/g, "");
--- a/test-markdown.json
+++ b/test-markdown.json
@@ -0,0 +1,6 @@
+{
+  "text": "# Test Rendering\n\n![](https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png)\n\nThe image above has no alt text but should still render.",
+  "mode": "gfm",
+  "context": "anthropics/claude-code-action"
+}
+EOF < /dev/null
--- a/test/data-formatter.test.ts
+++ b/test/data-formatter.test.ts
@@ -6,7 +6,6 @@ import {
  formatReviewComments,
  formatChangedFiles,
  formatChangedFilesWithSHA,
-  stripHtmlComments,
 } from "../src/github/data/formatter";
 import type {
  GitHubPullRequest,
@@ -99,9 +98,9 @@ Some more text.`;

    const result = formatBody(body, imageUrlMap);
    expect(result)
-      .toBe(`Here is some text with an image: ![screenshot](/tmp/github-images/image-1234-0.png)
+      .toBe(`Here is some text with an image: ![](/tmp/github-images/image-1234-0.png)
    
-And another one: ![another](/tmp/github-images/image-1234-1.jpg)
+And another one: ![](/tmp/github-images/image-1234-1.jpg)

 Some more text.`);
  });
@@ -124,7 +123,7 @@ Some more text.`);
    ]);

    const result = formatBody(body, imageUrlMap);
-    expect(result).toBe("![image](https://example.com/image.png)");
+    expect(result).toBe("![](https://example.com/image.png)");
  });

  test("handles multiple occurrences of same image", () => {
@@ -139,8 +138,8 @@ Second: ![img](https://github.com/user-attachments/assets/test.png)`;
    ]);

    const result = formatBody(body, imageUrlMap);
-    expect(result).toBe(`First: ![img](/tmp/github-images/image-1234-0.png)
-Second: ![img](/tmp/github-images/image-1234-0.png)`);
+    expect(result).toBe(`First: ![](/tmp/github-images/image-1234-0.png)
+Second: ![](/tmp/github-images/image-1234-0.png)`);
  });
 });

@@ -205,7 +204,7 @@ describe("formatComments", () => {

    const result = formatComments(comments, imageUrlMap);
    expect(result).toBe(
-      `[user1 at 2023-01-01T00:00:00Z]: Check out this screenshot: ![screenshot](/tmp/github-images/image-1234-0.png)\n\n[user2 at 2023-01-02T00:00:00Z]: Here's another image: ![bug](/tmp/github-images/image-1234-1.jpg)`,
+      `[user1 at 2023-01-01T00:00:00Z]: Check out this screenshot: ![](/tmp/github-images/image-1234-0.png)\n\n[user2 at 2023-01-02T00:00:00Z]: Here's another image: ![](/tmp/github-images/image-1234-1.jpg)`,
    );
  });

@@ -233,7 +232,7 @@ describe("formatComments", () => {

    const result = formatComments(comments, imageUrlMap);
    expect(result).toBe(
-      `[user1 at 2023-01-01T00:00:00Z]: Two images: ![first](/tmp/github-images/image-1234-0.png) and ![second](/tmp/github-images/image-1234-1.png)`,
+      `[user1 at 2023-01-01T00:00:00Z]: Two images: ![](/tmp/github-images/image-1234-0.png) and ![](/tmp/github-images/image-1234-1.png)`,
    );
  });

@@ -250,7 +249,7 @@ describe("formatComments", () => {

    const result = formatComments(comments);
    expect(result).toBe(
-      `[user1 at 2023-01-01T00:00:00Z]: Image: ![test](https://github.com/user-attachments/assets/test.png)`,
+      `[user1 at 2023-01-01T00:00:00Z]: Image: ![](https://github.com/user-attachments/assets/test.png)`,
    );
  });
 });
@@ -294,7 +293,7 @@ describe("formatReviewComments", () => {

    const result = formatReviewComments(reviewData);
    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n  [Comment on src/index.ts:42]: Nice implementation\n  [Comment on src/utils.ts:?]: Consider adding error handling`,
+      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nThis is a great PR! LGTM.\n  [Comment on src/index.ts:42]: Nice implementation\n  [Comment on src/utils.ts:?]: Consider adding error handling`,
    );
  });

@@ -317,7 +316,7 @@ describe("formatReviewComments", () => {

    const result = formatReviewComments(reviewData);
    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED`,
+      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nLooks good to me!`,
    );
  });

@@ -384,7 +383,7 @@ describe("formatReviewComments", () => {

    const result = formatReviewComments(reviewData);
    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: CHANGES_REQUESTED\n\n[Review by reviewer2 at 2023-01-02T00:00:00Z]: APPROVED`,
+      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: CHANGES_REQUESTED\nNeeds changes\n\n[Review by reviewer2 at 2023-01-02T00:00:00Z]: APPROVED\nLGTM`,
    );
  });

@@ -438,7 +437,7 @@ describe("formatReviewComments", () => {

    const result = formatReviewComments(reviewData, imageUrlMap);
    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n  [Comment on src/index.ts:42]: Comment with image: ![comment-img](/tmp/github-images/image-1234-1.png)`,
+      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nReview with image: ![](/tmp/github-images/image-1234-0.png)\n  [Comment on src/index.ts:42]: Comment with image: ![](/tmp/github-images/image-1234-1.png)`,
    );
  });

@@ -482,7 +481,7 @@ describe("formatReviewComments", () => {

    const result = formatReviewComments(reviewData, imageUrlMap);
    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n  [Comment on src/main.ts:15]: Two issues: ![issue1](/tmp/github-images/image-1234-0.png) and ![issue2](/tmp/github-images/image-1234-1.png)`,
+      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nGood work\n  [Comment on src/main.ts:15]: Two issues: ![](/tmp/github-images/image-1234-0.png) and ![](/tmp/github-images/image-1234-1.png)`,
    );
  });

@@ -515,7 +514,7 @@ describe("formatReviewComments", () => {

    const result = formatReviewComments(reviewData);
    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n  [Comment on src/index.ts:42]: Image: ![test](https://github.com/user-attachments/assets/test.png)`,
+      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nReview body\n  [Comment on src/index.ts:42]: Image: ![](https://github.com/user-attachments/assets/test.png)`,
    );
  });
 });
@@ -579,150 +578,3 @@ describe("formatChangedFilesWithSHA", () => {
    expect(result).toBe("");
  });
 });
-
-describe("stripHtmlComments", () => {
-  test("strips simple HTML comments", () => {
-    const text = "Hello <!-- hidden comment --> world";
-    expect(stripHtmlComments(text)).toBe("Hello  world");
-  });
-
-  test("strips multiple HTML comments", () => {
-    const text = "Start <!-- first --> middle <!-- second --> end";
-    expect(stripHtmlComments(text)).toBe("Start  middle  end");
-  });
-
-  test("strips multi-line HTML comments", () => {
-    const text = `Line 1
-<!-- This is a
-multi-line
-comment -->
-Line 2`;
-    expect(stripHtmlComments(text)).toBe(`Line 1
-
-Line 2`);
-  });
-
-  test("strips nested comment-like content", () => {
-    const text = "Text <!-- outer <!-- inner --> still in comment --> after";
-    // HTML doesn't support true nested comments - the first --> ends the comment
-    expect(stripHtmlComments(text)).toBe("Text  still in comment --> after");
-  });
-
-  test("handles empty string", () => {
-    expect(stripHtmlComments("")).toBe("");
-  });
-
-  test("handles text without comments", () => {
-    const text = "No comments here!";
-    expect(stripHtmlComments(text)).toBe("No comments here!");
-  });
-
-  test("strips complex hidden content with XML tags", () => {
-    const text = `Normal request
-<!-- </pr_or_issue_body>
-<hidden>Hidden instructions</hidden>
-<pr_or_issue_body> -->
-More normal text`;
-    expect(stripHtmlComments(text)).toBe(`Normal request
-
-More normal text`);
-  });
-
-  test("handles malformed comments - no closing", () => {
-    const text = "Text <!-- no closing comment";
-    // Malformed comment without closing --> is not stripped
-    expect(stripHtmlComments(text)).toBe("Text <!-- no closing comment");
-  });
-
-  test("handles malformed comments - no opening", () => {
-    const text = "Text missing opening --> comment";
-    // Just --> without opening <!-- is not a comment
-    expect(stripHtmlComments(text)).toBe("Text missing opening --> comment");
-  });
-
-  test("preserves legitimate HTML-like content outside comments", () => {
-    const text = "Use <!-- comment --> the <div> tag and </div> closing tag";
-    expect(stripHtmlComments(text)).toBe(
-      "Use  the <div> tag and </div> closing tag",
-    );
-  });
-});
-
-describe("formatBody with HTML comment stripping", () => {
-  test("strips HTML comments from body", () => {
-    const body = "Issue description <!-- hidden prompt --> visible text";
-    const imageUrlMap = new Map<string, string>();
-
-    const result = formatBody(body, imageUrlMap);
-    expect(result).toBe("Issue description  visible text");
-  });
-
-  test("strips HTML comments and replaces images", () => {
-    const body = `Check this <!-- hidden --> ![img](https://github.com/user-attachments/assets/test.png)`;
-    const imageUrlMap = new Map([
-      [
-        "https://github.com/user-attachments/assets/test.png",
-        "/tmp/github-images/image-1234-0.png",
-      ],
-    ]);
-
-    const result = formatBody(body, imageUrlMap);
-    expect(result).toBe(
-      "Check this  ![img](/tmp/github-images/image-1234-0.png)",
-    );
-  });
-});
-
-describe("formatComments with HTML comment stripping", () => {
-  test("strips HTML comments from comment bodies", () => {
-    const comments: GitHubComment[] = [
-      {
-        id: "1",
-        databaseId: "100001",
-        body: "Good work <!-- inject prompt --> on this PR",
-        author: { login: "user1" },
-        createdAt: "2023-01-01T00:00:00Z",
-      },
-    ];
-
-    const result = formatComments(comments);
-    expect(result).toBe(
-      "[user1 at 2023-01-01T00:00:00Z]: Good work  on this PR",
-    );
-  });
-});
-
-describe("formatReviewComments with HTML comment stripping", () => {
-  test("strips HTML comments from review comment bodies", () => {
-    const reviewData = {
-      nodes: [
-        {
-          id: "review1",
-          databaseId: "300001",
-          author: { login: "reviewer1" },
-          body: "LGTM",
-          state: "APPROVED",
-          submittedAt: "2023-01-01T00:00:00Z",
-          comments: {
-            nodes: [
-              {
-                id: "comment1",
-                databaseId: "200001",
-                body: "Nice work <!-- malicious --> here",
-                author: { login: "reviewer1" },
-                createdAt: "2023-01-01T00:00:00Z",
-                path: "src/index.ts",
-                line: 42,
-              },
-            ],
-          },
-        },
-      ],
-    };
-
-    const result = formatReviewComments(reviewData);
-    expect(result).toBe(
-      `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n  [Comment on src/index.ts:42]: Nice work  here`,
-    );
-  });
-});
--- a/test/integration-sanitization.test.ts
+++ b/test/integration-sanitization.test.ts
@@ -0,0 +1,156 @@
+import { describe, expect, it } from "bun:test";
+import { formatBody, formatComments } from "../src/github/data/formatter";
+import type { GitHubComment } from "../src/github/types";
+
+describe("Integration: Text Sanitization", () => {
+  it("should sanitize text in issue body", () => {
+    const body = `
+# Title text
+
+Some content here.
+
+Here's an image: <img alt="some alt text" src="image.jpg">
+
+And a markdown image: ![image text](screenshot.png)
+
+Check this link: [Click here](https://example.com "link title")
+
+Text with hidden‌‍characters
+
+<div data-prompt="test data" aria-label="label text" title="title text">
+  Content with attributes
+</div>
+
+Entity-encoded: &#72;&#69;&#76;&#76;&#79;
+
+Direction: ‮reversed‬ text
+
+<input placeholder="placeholder text" type="text">
+
+Textwithsofthyphens
+
+More text: with‌zero‍widthcharacters`;
+
+    const imageUrlMap = new Map<string, string>();
+    const result = formatBody(body, imageUrlMap);
+
+    expect(result).not.toContain("some alt text");
+    expect(result).not.toContain("image text");
+    expect(result).not.toContain("link title");
+    expect(result).not.toContain("test data");
+    expect(result).not.toContain("label text");
+    expect(result).not.toContain("title text");
+    expect(result).not.toContain("placeholder text");
+    expect(result).not.toContain('alt="');
+    expect(result).not.toContain('title="');
+    expect(result).not.toContain('aria-label="');
+    expect(result).not.toContain('data-prompt="');
+    expect(result).not.toContain('placeholder="');
+    expect(result).not.toContain("\u200B");
+    expect(result).not.toContain("\u200C");
+    expect(result).not.toContain("\u200D");
+    expect(result).not.toContain("\u00AD");
+    expect(result).not.toContain("\u202E");
+    expect(result).not.toContain("&#72;");
+
+    expect(result).toContain("# Title text");
+    expect(result).toContain("Some content here.");
+    expect(result).toContain("Here's an image:");
+    expect(result).toContain('<img src="image.jpg">');
+    expect(result).toContain("![](screenshot.png)");
+    expect(result).toContain("[Click here](https://example.com)");
+    expect(result).toContain("Content with attributes");
+    expect(result).toContain("HELLO");
+    expect(result).toContain('<input type="text">');
+  });
+
+  it("should sanitize text in comments", () => {
+    const comments: GitHubComment[] = [
+      {
+        id: "1",
+        databaseId: "100001",
+        body: `Comment text
+        
+Check this: ![description text](image.png)
+[Documentation](https://docs.com "doc title")
+
+Text‌‍with characters
+
+<span aria-label="span label" data-cmd="data value">Visible text</span>`,
+        author: { login: "user1" },
+        createdAt: "2023-01-01T00:00:00Z",
+      },
+    ];
+
+    const result = formatComments(comments);
+
+    expect(result).not.toContain("description text");
+    expect(result).not.toContain("doc title");
+    expect(result).not.toContain("span label");
+    expect(result).not.toContain("data value");
+    expect(result).not.toContain('aria-label="');
+    expect(result).not.toContain('data-cmd="');
+    expect(result).not.toContain("\u200B");
+    expect(result).not.toContain("\u200C");
+    expect(result).not.toContain("\u200D");
+
+    expect(result).toContain("Comment text");
+    expect(result).toContain("![](image.png)");
+    expect(result).toContain("[Documentation](https://docs.com)");
+    expect(result).toContain("Visible text");
+    expect(result).toContain("Textwith characters");
+  });
+
+  it("should handle complex mixed patterns", () => {
+    const content = `
+Text content here.
+
+<div title="divtitletext" data-instruction="data&#32;text">
+  <img src="image.jpg" alt="imgalttext">
+  Text with ‮reversed‬ content
+</div>
+
+![alt text\u200Bwith\u200Ccharacters](image.png)
+
+[link](url.com "title\u00ADtext")
+
+Mix: &#72;idden <span aria-label="&#77;ore">text</span>`;
+
+    const imageUrlMap = new Map<string, string>();
+    const result = formatBody(content, imageUrlMap);
+
+    expect(result).not.toContain('title="');
+    expect(result).not.toContain('data-instruction="');
+    expect(result).not.toContain('alt="');
+    expect(result).not.toContain('aria-label="');
+    expect(result).not.toContain("\u200B");
+    expect(result).not.toContain("\u200C");
+    expect(result).not.toContain("\u00AD");
+    expect(result).not.toContain("\u202E");
+
+    expect(result).toContain("Text content here.");
+    expect(result).toContain("<div>");
+    expect(result).toContain('<img src="image.jpg">');
+    expect(result).toContain("![](image.png)");
+    expect(result).toContain("[link](url.com)");
+    expect(result).toContain("Hidden <span>text</span>");
+  });
+
+  it("should handle edge cases with empty attributes", () => {
+    const edgeCases = `
+<img alt="" src="test.jpg">
+<div title="" data-x="">Content</div>
+![](already-empty.png)
+[link](url.com)
+Normal text`;
+
+    const imageUrlMap = new Map<string, string>();
+    const result = formatBody(edgeCases, imageUrlMap);
+
+    expect(result).toContain('<img src="test.jpg">');
+    expect(result).toContain("<div>Content</div>");
+    expect(result).toContain("![](already-empty.png)");
+    expect(result).toContain("[link](url.com)");
+    expect(result).toContain("Normal text");
+  });
+});
--- a/test/sanitizer.test.ts
+++ b/test/sanitizer.test.ts
@@ -0,0 +1,274 @@
+import { describe, expect, it } from "bun:test";
+import {
+  stripInvisibleCharacters,
+  stripMarkdownImageAltText,
+  stripMarkdownLinkTitles,
+  stripHiddenAttributes,
+  normalizeHtmlEntities,
+  sanitizeContent,
+  stripHtmlComments,
+} from "../src/github/utils/sanitizer";
+
+describe("stripInvisibleCharacters", () => {
+  it("should remove zero-width characters", () => {
+    expect(stripInvisibleCharacters("Hello\u200BWorld")).toBe("HelloWorld");
+    expect(stripInvisibleCharacters("Text\u200C\u200D")).toBe("Text");
+    expect(stripInvisibleCharacters("\uFEFFStart")).toBe("Start");
+  });
+
+  it("should remove control characters", () => {
+    expect(stripInvisibleCharacters("Hello\u0000World")).toBe("HelloWorld");
+    expect(stripInvisibleCharacters("Text\u001F\u007F")).toBe("Text");
+  });
+
+  it("should preserve common whitespace", () => {
+    expect(stripInvisibleCharacters("Hello\nWorld")).toBe("Hello\nWorld");
+    expect(stripInvisibleCharacters("Tab\there")).toBe("Tab\there");
+    expect(stripInvisibleCharacters("Carriage\rReturn")).toBe(
+      "Carriage\rReturn",
+    );
+  });
+
+  it("should remove soft hyphens", () => {
+    expect(stripInvisibleCharacters("Soft\u00ADHyphen")).toBe("SoftHyphen");
+  });
+
+  it("should remove Unicode direction overrides", () => {
+    expect(stripInvisibleCharacters("Text\u202A\u202BMore")).toBe("TextMore");
+    expect(stripInvisibleCharacters("\u2066Isolated\u2069")).toBe("Isolated");
+  });
+});
+
+describe("stripMarkdownImageAltText", () => {
+  it("should remove alt text from markdown images", () => {
+    expect(stripMarkdownImageAltText("![example alt text](image.png)")).toBe(
+      "![](image.png)",
+    );
+    expect(stripMarkdownImageAltText("Text ![description](pic.jpg) more text")).toBe(
+      "Text ![](pic.jpg) more text",
+    );
+  });
+
+  it("should handle multiple images", () => {
+    expect(stripMarkdownImageAltText("![one](1.png) ![two](2.png)")).toBe(
+      "![](1.png) ![](2.png)",
+    );
+  });
+
+  it("should handle empty alt text", () => {
+    expect(stripMarkdownImageAltText("![](image.png)")).toBe("![](image.png)");
+  });
+});
+
+describe("stripMarkdownLinkTitles", () => {
+  it("should remove titles from markdown links", () => {
+    expect(stripMarkdownLinkTitles('[Link](url.com "example title")')).toBe(
+      "[Link](url.com)",
+    );
+    expect(stripMarkdownLinkTitles("[Link](url.com 'example title')")).toBe(
+      "[Link](url.com)",
+    );
+  });
+
+  it("should handle multiple links", () => {
+    expect(
+      stripMarkdownLinkTitles('[One](1.com "first") [Two](2.com "second")'),
+    ).toBe("[One](1.com) [Two](2.com)");
+  });
+
+  it("should preserve links without titles", () => {
+    expect(stripMarkdownLinkTitles("[Link](url.com)")).toBe("[Link](url.com)");
+  });
+});
+
+describe("stripHiddenAttributes", () => {
+  it("should remove alt attributes", () => {
+    expect(stripHiddenAttributes('<img alt="example text" src="pic.jpg">')).toBe(
+      '<img src="pic.jpg">',
+    );
+    expect(stripHiddenAttributes("<img alt='example' src=\"pic.jpg\">")).toBe(
+      '<img src="pic.jpg">',
+    );
+    expect(stripHiddenAttributes('<img alt=example src="pic.jpg">')).toBe(
+      '<img src="pic.jpg">',
+    );
+  });
+
+  it("should remove title attributes", () => {
+    expect(
+      stripHiddenAttributes('<a title="example text" href="#">Link</a>'),
+    ).toBe('<a href="#">Link</a>');
+    expect(stripHiddenAttributes("<div title='example'>Content</div>")).toBe(
+      "<div>Content</div>",
+    );
+  });
+
+  it("should remove aria-label attributes", () => {
+    expect(
+      stripHiddenAttributes('<button aria-label="example">Click</button>'),
+    ).toBe("<button>Click</button>");
+  });
+
+  it("should remove data-* attributes", () => {
+    expect(
+      stripHiddenAttributes(
+        '<div data-test="example" data-info="more example">Text</div>',
+      ),
+    ).toBe("<div>Text</div>");
+  });
+
+  it("should remove placeholder attributes", () => {
+    expect(
+      stripHiddenAttributes('<input placeholder="example text" type="text">'),
+    ).toBe('<input type="text">');
+  });
+
+  it("should handle multiple attributes", () => {
+    expect(
+      stripHiddenAttributes(
+        '<img alt="example" title="test" src="pic.jpg" class="image">',
+      ),
+    ).toBe('<img src="pic.jpg" class="image">');
+  });
+});
+
+describe("normalizeHtmlEntities", () => {
+  it("should decode numeric entities", () => {
+    expect(normalizeHtmlEntities("&#72;&#101;&#108;&#108;&#111;")).toBe(
+      "Hello",
+    );
+    expect(normalizeHtmlEntities("&#65;&#66;&#67;")).toBe("ABC");
+  });
+
+  it("should decode hex entities", () => {
+    expect(normalizeHtmlEntities("&#x48;&#x65;&#x6C;&#x6C;&#x6F;")).toBe(
+      "Hello",
+    );
+    expect(normalizeHtmlEntities("&#x41;&#x42;&#x43;")).toBe("ABC");
+  });
+
+  it("should remove non-printable entities", () => {
+    expect(normalizeHtmlEntities("&#0;&#31;")).toBe("");
+    expect(normalizeHtmlEntities("&#x00;&#x1F;")).toBe("");
+  });
+
+  it("should preserve normal text", () => {
+    expect(normalizeHtmlEntities("Normal text")).toBe("Normal text");
+  });
+});
+
+describe("sanitizeContent", () => {
+  it("should apply all sanitization measures", () => {
+    const testContent = `
+      <!-- This is a comment -->
+      <img alt="example alt text" src="image.jpg">
+      ![example image description](screenshot.png)
+      [click here](https://example.com "example title")
+      <div data-prompt="example data" aria-label="example label">
+        Normal text with hidden\u200Bcharacters
+      </div>
+      &#72;&#105;&#100;&#100;&#101;&#110; message
+    `;
+
+    const sanitized = sanitizeContent(testContent);
+
+    expect(sanitized).not.toContain("example alt text");
+    expect(sanitized).not.toContain("example image description");
+    expect(sanitized).not.toContain("example title");
+    expect(sanitized).not.toContain("example data");
+    expect(sanitized).not.toContain("example label");
+    expect(sanitized).not.toContain("\u200B");
+    expect(sanitized).not.toContain("alt=");
+    expect(sanitized).not.toContain("data-prompt=");
+    expect(sanitized).not.toContain("aria-label=");
+
+    expect(sanitized).toContain("Normal text with hiddencharacters");
+    expect(sanitized).toContain("Hidden message");
+    expect(sanitized).toContain('<img src="image.jpg">');
+    expect(sanitized).toContain("![](screenshot.png)");
+    expect(sanitized).toContain("[click here](https://example.com)");
+  });
+
+  it("should handle complex nested patterns", () => {
+    const complexContent = `
+      Text with ![alt \u200B text](image.png) and more.
+      <a href="#" title="example\u00ADtitle">Link</a>
+      <div data-x="&#72;&#105;">Content</div>
+    `;
+
+    const sanitized = sanitizeContent(complexContent);
+
+    expect(sanitized).not.toContain("\u200B");
+    expect(sanitized).not.toContain("\u00AD");
+    expect(sanitized).not.toContain("alt ");
+    expect(sanitized).not.toContain('title="');
+    expect(sanitized).not.toContain('data-x="');
+    expect(sanitized).toContain("![](image.png)");
+    expect(sanitized).toContain('<a href="#">Link</a>');
+  });
+
+  it("should preserve legitimate markdown and HTML", () => {
+    const legitimateContent = `
+      # Heading
+      
+      This is **bold** and *italic* text.
+      
+      Here's a normal image: ![](normal.jpg)
+      And a normal link: [Click here](https://example.com)
+      
+      <div class="container">
+        <p id="para">Normal paragraph</p>
+        <input type="text" name="field">
+      </div>
+    `;
+
+    const sanitized = sanitizeContent(legitimateContent);
+
+    expect(sanitized).toBe(legitimateContent);
+  });
+
+  it("should handle entity-encoded text", () => {
+    const encodedText = `
+      &#72;&#105;&#100;&#100;&#101;&#110; &#109;&#101;&#115;&#115;&#97;&#103;&#101;
+      <div title="&#101;&#120;&#97;&#109;&#112;&#108;&#101;">Test</div>
+    `;
+
+    const sanitized = sanitizeContent(encodedText);
+
+    expect(sanitized).toContain("Hidden message");
+    expect(sanitized).not.toContain('title="');
+    expect(sanitized).toContain("<div>Test</div>");
+  });
+
+  it("should handle mixed input patterns", () => {
+    const mixedInput = `
+      ![example\u200Btext\u00ADwith\u00ADcharacters](image.png)
+      <img alt="&#101;&#120;&#97;&#109;&#112;&#108;&#101;" src="pic.jpg">
+      [link](url.com "title\u202Ewith\u202Ccharacters")
+      <span data-cmd="data value" aria-label="label text">visible text</span>
+    `;
+
+    const sanitized = sanitizeContent(mixedInput);
+
+    expect(sanitized).not.toContain("example");
+    expect(sanitized).not.toContain("characters");
+    expect(sanitized).not.toContain("title");
+    expect(sanitized).not.toContain("data value");
+    expect(sanitized).not.toContain("label text");
+    expect(sanitized).toContain("visible text");
+  });
+});
+
+describe("stripHtmlComments (legacy)", () => {
+  it("should remove HTML comments", () => {
+    expect(stripHtmlComments("Hello <!-- example -->World")).toBe("Hello World");
+    expect(stripHtmlComments("<!-- comment -->Text")).toBe("Text");
+    expect(stripHtmlComments("Text<!-- comment -->")).toBe("Text");
+  });
+
+  it("should handle multiline comments", () => {
+    expect(stripHtmlComments("Hello <!-- \nexample\n -->World")).toBe(
+      "Hello World",
+    );
+  });
+});
				`@@ -0,0 +1 @@`
				`{"message":"Problems parsing JSON","documentation_url":"https://docs.github.com/rest/markdown/markdown#render-a-markdown-document","status":"400"}`