duckduckgo · jonathanKingston · Oct 10, 2025 · Oct 11, 2025 · Oct 16, 2025 · Oct 16, 2025
@@ -19,3 +19,4 @@ injected/unit-test/fixtures/page-context/output/
 .netlify
 # VS Code user config
 .vscode
+page-context-collector/
@@ -2,6 +2,54 @@
 import { TestTransportConfig } from '../../messaging/index.js';
 import { getTabUrl } from '../src/utils.js';
 
+// Initialize the test harness global immediately when script loads
+/** @type {Map<string, (d: any)=>void>} */
+const globalSubscriptions = new Map();
+
+if (!window.__playwright_01) {
+    window.__playwright_01 = {
+        mockResponses: {},
+        subscriptionEvents: [],
+        mocks: {
+            outgoing: [],
+        },
+        publishSubscriptionEvent: (evt) => {
+            const matchingCallback = globalSubscriptions.get(evt.subscriptionName);
+            if (!matchingCallback) {
+                console.error('no matching callback for subscription', evt);
+                return;
+            }
+            matchingCallback(evt.params);
+        }
+    };
+}
+
+/**
+ * Create a mock transport for extension testing that integrates with the test harness
+ * Following the same pattern as special-pages mock transports
+ */
+function createExtensionMockTransport() {
+
+    return new TestTransportConfig({
+        notify(_msg) {
+            window.__playwright_01?.mocks?.outgoing?.push?.({ payload: structuredClone(_msg) });
+        },
+        request: async (_msg) => {
+            window.__playwright_01?.mocks?.outgoing?.push?.({ payload: structuredClone(_msg) });
+            // Return empty response for testing
+            return {};
+        },
+        subscribe(_msg, callback) {
+            window.__playwright_01?.mocks?.outgoing?.push?.({ payload: structuredClone(_msg) });
+            // Register the subscription with the test harness (same pattern as special pages)
+            globalSubscriptions.set(_msg.subscriptionName, callback);
+            return () => {
+                globalSubscriptions.delete(_msg.subscriptionName);
+            };
+        },
+    });
+}
+
 function generateConfig() {
     const topLevelUrl = getTabUrl();
     return {
@@ -36,6 +84,7 @@
                 'apiManipulation',
                 'duckPlayer',
                 'duckPlayerNative',
+                'pageContext',
             ],
         },
     };
@@ -78,19 +127,7 @@
     const processedConfig = generateConfig();
 
     // mock Messaging and allow for tests to intercept them
-    globalThis.cssMessaging = processedConfig.messagingConfig = new TestTransportConfig({
-        notify() {
-            // noop
-        },
-        request: async () => {
-            // noop
-        },
-        subscribe() {
-            return () => {
-                // noop
-            };
-        },
-    });
+    globalThis.cssMessaging = processedConfig.messagingConfig = createExtensionMockTransport();
 
     load({
         // @ts-expect-error Types of property 'name' are incompatible.

@@ -0,0 +1,129 @@
+# Page Context Content Collection
+
+This module provides utilities for collecting web page content using DuckDuckGo's content-scope-scripts page-context feature with Playwright.
+
+## Features
+
+- Extract structured content from web pages (title, headings, links, images, text content)
+- Configure content extraction settings (length limits, excluded selectors, etc.)
+- Handle content caching and truncation
+- Work with the existing fake extension infrastructure
+
+## Quick Start
+
+### Basic Usage
+
+```javascript
+import { PageContextCollector, pageContextTest } from './helpers/page-context-collector.js';
+import { test } from '@playwright/test';
+
+const testBase = pageContextTest(test);
+
+testBase('collect page content', async ({ page }, testInfo) => {
+    const collector = PageContextCollector.create(page, testInfo);
+
+    const content = await collector.loadAndCollect('https://example.com');
+
+    console.log('Title:', content.title);
+    console.log('Content:', content.content);
+    console.log('Headings:', content.headings);
+});
+```
+
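+### Advanced Configuration
+
+> **Note:** `withPageContextSettings()` is currently a placeholder. The options passed to it are ignored; the effective settings come from the JSON config file, which can be overridden via the optional third argument of `PageContextCollector.create(page, testInfo, configPath)`.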
+
+```javascript
+const collector = PageContextCollector.create(page, testInfo);
+
+collector.withPageContextSettings({
+    maxContentLength: 5000,
+    maxTitleLength: 100,
+    excludeSelectors: ['.ad', '.sidebar', '.navigation'],
+    subscribeToCollect: { enabled: true },
+    cacheExpiration: 30000
+});
+
+const content = await collector.loadAndCollect('https://example.com');
+```
+
+### Standalone Example
+
+Run the example script to test content extraction:
+
+```bash
+cd content-scope-scripts/injected
+node integration-test/page-context-example.js https://example.com
+```
+
+## API Reference
+
+### PageContextCollector
+
+#### Methods
+
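+- `static create(page, testInfo, configPath?)` - Create a new collector instance, optionally pointing at a custom JSON config file
+- `withPageContextSettings(options)` - Placeholder for configuring page-context feature settings; the options are currently ignored and the JSON config file is used instead
+- `loadUrl(url)` - Load a URL and initialize page-context
+- `collectPageContext()` - Trigger content collection and return results
+- `loadAndCollect(url, settings?)` - Load URL and collect content in one step (`settings` is forwarded to `withPageContextSettings` and is currently ignored)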
+
+#### Content Structure
+
+The collected content object contains:
+
+```javascript
+{
+    title: string,           // Page title
+    metaDescription: string, // Meta description
+    content: string,         // Main content as markdown
+    truncated: boolean,      // Whether content was truncated
+    fullContentLength: number, // Original content length
+    headings: Array<{level: number, text: string}>,
+    links: Array<{text: string, href: string}>,
+    images: Array<{src: string, alt: string}>,
+    favicon: string,         // Favicon URL
+    url: string,            // Page URL
+    timestamp: number       // Collection timestamp
+}
+```
+
+#### Configuration Options
+
+```javascript
+{
+    subscribeToCollect: { enabled: boolean },
+    subscribeToHashChange: { enabled: boolean },
+    subscribeToPageShow: { enabled: boolean },
+    subscribeToVisibilityChange: { enabled: boolean },
+    maxContentLength: number,        // Default: 9500
+    maxTitleLength: number,          // Default: 100
+    cacheExpiration: number,         // Default: 30000ms
+    recheckLimit: number,            // Default: 5
+    excludeSelectors: string[]       // CSS selectors to exclude
+}
+```
+
+## Testing
+
+Run the Playwright tests:
+
+```bash
+cd content-scope-scripts/injected
+npm run playwright -- page-context-collection.spec.js
+```
+
+## Implementation Details
+
+The implementation leverages:
+
+- Content-scope-scripts' existing `page-context` feature
+- Playwright's browser extension support via the fake extension
+- The `simulateSubscriptionMessage` testing utility
+- Direct messaging without requiring background scripts
+
+The page-context feature automatically:
+
+- Converts DOM content to markdown format
+- Handles content truncation and caching
+- Extracts structured data (headings, links, images)
+- Respects visibility and exclusion rules
+- Provides performance monitoring
@@ -0,0 +1,136 @@
+import { testContextForExtension } from './harness.js';
+import { ResultsCollector } from '../page-objects/results-collector.js';
+
+/**
+ * A utility for collecting page content using the page-context feature
+ */
+export class PageContextCollector {
+    /**
+     * @param {import("@playwright/test").Page} page
+     * @param {import("@playwright/test").TestInfo} testInfo
+     * @param {string} [configPath] - Optional config file path
+     */
+    constructor(page, testInfo, configPath) {
+        this.page = page;
+        this.testInfo = testInfo;
+        this.collector = ResultsCollector.create(page, testInfo.project.use);
+        this.configPath = configPath || './integration-test/test-pages/page-context/config/page-context-enabled.json';
+    }
+
+    /**
+     * Create a PageContextCollector instance
+     * @param {import("@playwright/test").Page} page
+     * @param {import("@playwright/test").TestInfo} testInfo
+     * @param {string} [configPath] - Optional config file path
+     * @returns {PageContextCollector}
+     */
+    static create(page, testInfo, configPath) {
+        return new PageContextCollector(page, testInfo, configPath);
+    }
+
+    /**
+     * Configure page-context feature settings by using a custom config file
+     * This method is kept for API compatibility but the actual configuration
+     * should be done via the JSON config file.
+     * @param {Object} options - Configuration options (for documentation purposes)
+     * @param {boolean} [options.subscribeToCollect=true] - Enable collect subscription
+     * @param {boolean} [options.subscribeToHashChange=true] - Enable hashchange listener
+     * @param {boolean} [options.subscribeToPageShow=true] - Enable pageshow listener
+     * @param {boolean} [options.subscribeToVisibilityChange=true] - Enable visibility change listener
+     * @param {number} [options.maxContentLength=9500] - Maximum content length
+     * @param {number} [options.maxTitleLength=100] - Maximum title length
+     * @param {number} [options.cacheExpiration=30000] - Cache expiration in ms
+     * @param {number} [options.recheckLimit=5] - Maximum recheck attempts
+     * @param {string[]} [options.excludeSelectors] - CSS selectors to exclude
+     * @returns {PageContextCollector}
+     */
+    withPageContextSettings(options = {}) {
+        // For now, we use the JSON config file for settings
+        // In the future, this could be enhanced to dynamically modify the config
+        console.log('Page context settings configured via JSON config file');
+        return this;
+    }
+
+    /**
+     * Load a URL and wait for page-context initialization
+     * @param {string} url - The URL to load
+     * @returns {Promise<void>}
+     */
+    async loadUrl(url) {
+        console.log('Loading URL');
+        // Load the page with the page-context config
+        await this.collector.load(url, this.configPath);
+        console.log('URL loaded');
+        // Wait for page load to complete
+        await this.page.waitForLoadState('networkidle');
+    }
+
+    /**
+     * Trigger page context collection
+     * @returns {Promise<Object>} - The collected page content
+     */
+    async collectPageContext() {
+        // Trigger the collect subscription message
+        await this.collector.simulateSubscriptionMessage('pageContext', 'collect', {});
+
+        // Wait for and capture the collectionResult response
+        const messages = await this.collector.waitForMessage('collectionResult', 1);
+
+        if (!messages || messages.length === 0) {
+            throw new Error('No collectionResult message received');
+        }
+
+        const message = messages[0];
+
+        // Parse the serialized page data
+        if (message.payload && message.payload.params && message.payload.params.serializedPageData) {
+            return JSON.parse(message.payload.params.serializedPageData);
+        }
+
+        throw new Error('Invalid collectionResult message format');
+    }
+
+    /**
+     * Load a URL and collect its page context in one step
+     * @param {string} url - The URL to load and collect content from
+     * @param {Object} [pageContextSettings] - Optional page-context settings (for API compatibility)
+     * @returns {Promise<Object>} - The collected page content
+     */
+    async loadAndCollect(url, pageContextSettings = {}) {
+        if (Object.keys(pageContextSettings).length > 0) {
+            this.withPageContextSettings(pageContextSettings);
+        }
+        await this.loadUrl(url);
+        return await this.collectPageContext();
+    }
+
+    /**
+     * Get the underlying ResultsCollector instance for advanced usage
+     * @returns {ResultsCollector}
+     */
+    getCollector() {
+        return this.collector;
+    }
+}
+
+/**
+ * Create a Playwright test context configured for page-context collection
+ * @param {import("@playwright/test").test} test - The base test object
+ * @returns {import("@playwright/test").test} - The configured test object
+ */
+export function pageContextTest(test) {
+    return testContextForExtension(test);
+}
+
+/**
+ * Convenience function to collect page content from a URL
+ * @param {import("@playwright/test").Page} page
+ * @param {import("@playwright/test").TestInfo} testInfo
+ * @param {string} url
+ * @param {Object} [options] - Optional page-context settings
+ * @returns {Promise<Object>} - The collected page content
+ */
+export async function collectPageContent(page, testInfo, url, options = {}) {
+    const collector = PageContextCollector.create(page, testInfo);
+    return await collector.loadAndCollect(url, options);
+}