Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ injected/unit-test/fixtures/page-context/output/
.netlify
# VS Code user config
.vscode
page-context-collector/
63 changes: 50 additions & 13 deletions injected/entry-points/integration.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,54 @@
import { TestTransportConfig } from '../../messaging/index.js';
import { getTabUrl } from '../src/utils.js';

// Set up the shared test-harness state as soon as this script is evaluated.

/**
 * Subscription callbacks keyed by subscription name.
 * @type {Map<string, (d: any)=>void>}
 */
const globalSubscriptions = new Map();

// Only create the harness object when nothing has defined it yet.
// NOTE(review): if something else defines `window.__playwright_01` first, the
// `publishSubscriptionEvent` below is never installed — confirm that is intended.
if (!window.__playwright_01) {
    /**
     * Deliver a simulated subscription event to the callback registered
     * under `evt.subscriptionName`, logging an error when there is none.
     */
    const publishSubscriptionEvent = (evt) => {
        const handler = globalSubscriptions.get(evt.subscriptionName);
        if (handler) {
            handler(evt.params);
            return;
        }
        console.error('no matching callback for subscription', evt);
    };

    window.__playwright_01 = {
        mockResponses: {},
        subscriptionEvents: [],
        mocks: { outgoing: [] },
        publishSubscriptionEvent,
    };
}

/**
 * Create a mock transport for extension testing that integrates with the test harness.
 * Following the same pattern as special-pages mock transports.
 *
 * Every outgoing message (notify / request / subscribe) is cloned and appended to
 * `window.__playwright_01.mocks.outgoing` when that harness object is present.
 * Subscriptions are stored in `globalSubscriptions` under their subscription name.
 *
 * @returns {TestTransportConfig} transport config backed by the in-page test harness
 */
function createExtensionMockTransport() {
    /**
     * Append a clone of the message to the harness' outgoing list.
     * The optional chain makes this a no-op when the harness is not installed.
     * @param {any} msg
     */
    const recordOutgoing = (msg) => {
        window.__playwright_01?.mocks?.outgoing?.push?.({ payload: structuredClone(msg) });
    };

    return new TestTransportConfig({
        notify(_msg) {
            recordOutgoing(_msg);
        },
        // Deliberately not `async`: there is nothing to await, which is what the
        // `require-await` lint check rejected. The Promise executor still runs
        // synchronously and turns a thrown clone error into a rejection, so callers
        // observe the same behaviour as the previous async implementation.
        request: (_msg) =>
            new Promise((resolve) => {
                recordOutgoing(_msg);
                // Return empty response for testing
                resolve({});
            }),
        subscribe(_msg, callback) {
            recordOutgoing(_msg);
            // Register the subscription with the test harness (same pattern as special pages)
            globalSubscriptions.set(_msg.subscriptionName, callback);
            return () => {
                globalSubscriptions.delete(_msg.subscriptionName);
            };
        },
    });
}

function generateConfig() {
const topLevelUrl = getTabUrl();
return {
Expand Down Expand Up @@ -36,6 +84,7 @@
'apiManipulation',
'duckPlayer',
'duckPlayerNative',
'pageContext',
],
},
};
Expand Down Expand Up @@ -78,19 +127,7 @@
const processedConfig = generateConfig();

// mock Messaging and allow for tests to intercept them
globalThis.cssMessaging = processedConfig.messagingConfig = new TestTransportConfig({
notify() {
// noop
},
request: async () => {
// noop
},
subscribe() {
return () => {
// noop
};
},
});
globalThis.cssMessaging = processedConfig.messagingConfig = createExtensionMockTransport();

load({
// @ts-expect-error Types of property 'name' are incompatible.
Expand Down
129 changes: 129 additions & 0 deletions injected/integration-test/README-page-context.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Page Context Content Collection

This module provides utilities for collecting web page content using DuckDuckGo's content-scope-scripts page-context feature with Playwright.

## Features

- Extract structured content from web pages (title, headings, links, images, text content)
- Configure content extraction settings (length limits, excluded selectors, etc.)
- Handle content caching and truncation
- Work with the existing fake extension infrastructure

## Quick Start

### Basic Usage

```javascript
import { PageContextCollector, pageContextTest } from './helpers/page-context-collector.js';
import { test } from '@playwright/test';

const testBase = pageContextTest(test);

testBase('collect page content', async ({ page }, testInfo) => {
const collector = PageContextCollector.create(page, testInfo);

const content = await collector.loadAndCollect('https://example.com');

console.log('Title:', content.title);
console.log('Content:', content.content);
console.log('Headings:', content.headings);
});
```

### Advanced Configuration

```javascript
const collector = PageContextCollector.create(page, testInfo);

collector.withPageContextSettings({
maxContentLength: 5000,
maxTitleLength: 100,
excludeSelectors: ['.ad', '.sidebar', '.navigation'],
subscribeToCollect: { enabled: true },
cacheExpiration: 30000
});

const content = await collector.loadAndCollect('https://example.com');
```

### Standalone Example

Run the example script to test content extraction:

```bash
cd content-scope-scripts/injected
node integration-test/page-context-example.js https://example.com
```

## API Reference

### PageContextCollector

#### Methods

- `static create(page, testInfo)` - Create a new collector instance
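- `withPageContextSettings(options)` - Placeholder kept for API compatibility: currently ignores `options`, logs a notice and returns the collector; the actual settings come from the JSON config file
- `loadUrl(url)` - Load a URL and initialize page-context
- `collectPageContext()` - Trigger content collection and return results
- `loadAndCollect(url, settings?)` - Load URL and collect content in one step (`settings` is currently passed to `withPageContextSettings` and therefore has no effect)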

#### Content Structure

The collected content object contains:

```javascript
{
title: string, // Page title
metaDescription: string, // Meta description
content: string, // Main content as markdown
truncated: boolean, // Whether content was truncated
fullContentLength: number, // Original content length
headings: Array<{level: number, text: string}>,
links: Array<{text: string, href: string}>,
images: Array<{src: string, alt: string}>,
favicon: string, // Favicon URL
url: string, // Page URL
timestamp: number // Collection timestamp
}
```

#### Configuration Options

```javascript
{
subscribeToCollect: { enabled: boolean },
subscribeToHashChange: { enabled: boolean },
subscribeToPageShow: { enabled: boolean },
subscribeToVisibilityChange: { enabled: boolean },
maxContentLength: number, // Default: 9500
maxTitleLength: number, // Default: 100
cacheExpiration: number, // Default: 30000ms
recheckLimit: number, // Default: 5
excludeSelectors: string[] // CSS selectors to exclude
}
```

## Testing

Run the Playwright tests:

```bash
cd content-scope-scripts/injected
npm run playwright -- page-context-collection.spec.js
```

## Implementation Details

The implementation leverages:

- Content-scope-scripts' existing `page-context` feature
- Playwright's browser extension support via the fake extension
- The `simulateSubscriptionMessage` testing utility
- Direct messaging without requiring background scripts

The page-context feature automatically:
- Converts DOM content to markdown format
- Handles content truncation and caching
- Extracts structured data (headings, links, images)
- Respects visibility and exclusion rules
- Provides performance monitoring
136 changes: 136 additions & 0 deletions injected/integration-test/helpers/page-context-collector.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import { testContextForExtension } from './harness.js';
import { ResultsCollector } from '../page-objects/results-collector.js';

/**
 * Test helper that loads a page through a ResultsCollector and reads back the
 * result of the page-context feature's `collect` subscription.
 */
export class PageContextCollector {
    /**
     * @param {import("@playwright/test").Page} page
     * @param {import("@playwright/test").TestInfo} testInfo
     * @param {string} [configPath] - Config file to load; a falsy value selects the bundled page-context config
     */
    constructor(page, testInfo, configPath) {
        const fallbackConfigPath = './integration-test/test-pages/page-context/config/page-context-enabled.json';

        this.page = page;
        this.testInfo = testInfo;
        this.collector = ResultsCollector.create(page, testInfo.project.use);
        this.configPath = configPath ? configPath : fallbackConfigPath;
    }

    /**
     * Factory equivalent of calling the constructor.
     * @param {import("@playwright/test").Page} page
     * @param {import("@playwright/test").TestInfo} testInfo
     * @param {string} [configPath] - Optional config file path
     * @returns {PageContextCollector}
     */
    static create(page, testInfo, configPath) {
        return new PageContextCollector(page, testInfo, configPath);
    }

    /**
     * Placeholder for configuring the page-context feature.
     * The options are not applied: this method only logs a notice and returns
     * the collector, because settings are taken from the JSON config file.
     * @param {Object} options - Configuration options (for documentation purposes)
     * @param {boolean} [options.subscribeToCollect=true] - Enable collect subscription
     * @param {boolean} [options.subscribeToHashChange=true] - Enable hashchange listener
     * @param {boolean} [options.subscribeToPageShow=true] - Enable pageshow listener
     * @param {boolean} [options.subscribeToVisibilityChange=true] - Enable visibility change listener
     * @param {number} [options.maxContentLength=9500] - Maximum content length
     * @param {number} [options.maxTitleLength=100] - Maximum title length
     * @param {number} [options.cacheExpiration=30000] - Cache expiration in ms
     * @param {number} [options.recheckLimit=5] - Maximum recheck attempts
     * @param {string[]} [options.excludeSelectors] - CSS selectors to exclude
     * @returns {PageContextCollector} this instance, to allow chaining
     */
    withPageContextSettings(options = {}) {
        // Settings live in the JSON config file for now; a later version could
        // rewrite the config dynamically from `options`.
        console.log('Page context settings configured via JSON config file');
        return this;
    }

    /**
     * Load `url` with the configured config file, then wait for the page to
     * reach Playwright's 'networkidle' load state.
     * @param {string} url - The URL to load
     * @returns {Promise<void>}
     */
    async loadUrl(url) {
        const { collector, configPath, page } = this;

        console.log('Loading URL');
        await collector.load(url, configPath);
        console.log('URL loaded');

        await page.waitForLoadState('networkidle');
    }

    /**
     * Simulate the `collect` subscription message and parse the page data
     * carried by the first `collectionResult` message.
     * @returns {Promise<Object>} - The collected page content
     * @throws {Error} when no message arrives or it carries no serialized page data
     */
    async collectPageContext() {
        const { collector } = this;

        // Ask the feature to collect, then wait for its answer.
        await collector.simulateSubscriptionMessage('pageContext', 'collect', {});
        const received = await collector.waitForMessage('collectionResult', 1);

        const hasMessages = Boolean(received) && received.length !== 0;
        if (!hasMessages) {
            throw new Error('No collectionResult message received');
        }

        // The page data arrives as a JSON string nested inside the payload params.
        const serialized = received[0].payload?.params?.serializedPageData;
        if (!serialized) {
            throw new Error('Invalid collectionResult message format');
        }

        return JSON.parse(serialized);
    }

    /**
     * Convenience wrapper: load `url`, then collect its page context.
     * @param {string} url - The URL to load and collect content from
     * @param {Object} [pageContextSettings] - Optional page-context settings (for API compatibility)
     * @returns {Promise<Object>} - The collected page content
     */
    async loadAndCollect(url, pageContextSettings = {}) {
        const hasSettings = Object.keys(pageContextSettings).length > 0;
        if (hasSettings) {
            this.withPageContextSettings(pageContextSettings);
        }

        await this.loadUrl(url);
        const collected = await this.collectPageContext();
        return collected;
    }

    /**
     * Expose the wrapped ResultsCollector for advanced usage.
     * @returns {ResultsCollector}
     */
    getCollector() {
        return this.collector;
    }
}

/**
 * Build the Playwright test object used for page-context collection by
 * passing the base test through `testContextForExtension`.
 * @param {import("@playwright/test").test} test - The base test object
 * @returns {import("@playwright/test").test} - The configured test object
 */
export function pageContextTest(test) {
    const extensionTest = testContextForExtension(test);
    return extensionTest;
}

/**
 * One-shot helper: create a PageContextCollector for the page, load `url`
 * and resolve with the collected page content.
 * @param {import("@playwright/test").Page} page
 * @param {import("@playwright/test").TestInfo} testInfo
 * @param {string} url
 * @param {Object} [options] - Optional page-context settings
 * @returns {Promise<Object>} - The collected page content
 */
export async function collectPageContent(page, testInfo, url, options = {}) {
    const pageContent = await PageContextCollector.create(page, testInfo).loadAndCollect(url, options);
    return pageContent;
}
Loading
Loading