@@ -14524,7 +14524,7 @@ ul.messages {
1452414524 break;
1452514525 case "UNKNOWN":
1452614526 default:
14527- console.warn ("No known pageType");
14527+ logger.log ("No known pageType");
1452814528 }
1452914529 if (this.currentPage) {
1453014530 this.currentPage.destroy();
@@ -21829,6 +21829,20 @@ ${truncatedWarning}
2182921829 return node.nodeType === Node.ELEMENT_NODE;
2183021830 }
2183121831 function getSameOriginIframeDocument(iframe) {
21832+ const src = iframe.src;
21833+ if (iframe.hasAttribute("sandbox") && !iframe.sandbox.contains("allow-scripts")) {
21834+ return null;
21835+ }
21836+ if (src && src !== "about:blank" && src !== "") {
21837+ try {
21838+ const iframeUrl = new URL(src, window.location.href);
21839+ if (iframeUrl.origin !== window.location.origin) {
21840+ return null;
21841+ }
21842+ } catch (e) {
21843+ return null;
21844+ }
21845+ }
2183221846 try {
2183321847 const doc = iframe.contentDocument;
2183421848 if (doc && doc.documentElement) {
@@ -21864,7 +21878,7 @@ ${truncatedWarning}
2186421878 if (!isHtmlElement(node)) {
2186521879 return "";
2186621880 }
21867- if (!checkNodeIsVisible(node) || node.matches(settings.excludeSelectors)) {
21881+ if (!checkNodeIsVisible(node) || settings.excludeSelectors && node.matches(settings.excludeSelectors)) {
2186821882 return "";
2186921883 }
2187021884 const tag = node.tagName.toLowerCase();
@@ -21896,17 +21910,22 @@ ${truncatedWarning}
2189621910`;
2189721911 case "br":
2189821912 return `
21913+ `;
21914+ case "img":
21915+ return `
21916+ })
2189921917`;
2190021918 case "ul":
21919+ case "ol":
2190121920 return `
2190221921${children}
2190321922`;
2190421923 case "li":
2190521924 return `
21906- - ${children.trim( )}
21925+ - ${collapseAndTrim(children )}
2190721926`;
2190821927 case "a":
21909- return getLinkText(node);
21928+ return getLinkText(node, children, settings );
2191021929 case "iframe": {
2191121930 if (!settings.includeIframes) {
2191221931 return children;
@@ -21931,12 +21950,20 @@ ${iframeContent}
2193121950 return children;
2193221951 }
2193321952 }
21953+ function getAttributeOrBlank(node, attr) {
21954+ const attrValue = node.getAttribute(attr) ?? "";
21955+ return attrValue.trim();
21956+ }
2193421957 function collapseAndTrim(str) {
2193521958 return collapseWhitespace(str).trim();
2193621959 }
21937- function getLinkText(node) {
21960+ function getLinkText(node, children, settings ) {
2193821961 const href = node.getAttribute("href");
21939- return href ? `[${collapseAndTrim(node.textContent)}](${href})` : collapseWhitespace(node.textContent);
21962+ const trimmedContent = collapseAndTrim(children);
21963+ if (settings.trimBlankLinks && trimmedContent.length === 0) {
21964+ return "";
21965+ }
21966+ return href ? `[${trimmedContent}](${href})` : collapseWhitespace(children);
2194021967 }
2194121968 var _cachedContent, _cachedTimestamp, _delayedRecheckTimer;
2194221969 var PageContext = class extends ContentFeature {
@@ -22167,6 +22194,8 @@ ${iframeContent}
2216722194 const maxDepth = this.getFeatureSetting("maxDepth") || 5e3;
2216822195 let excludeSelectors = this.getFeatureSetting("excludeSelectors") || [".ad", ".sidebar", ".footer", ".nav", ".header"];
2216922196 const excludedInertElements = this.getFeatureSetting("excludedInertElements") || [
22197+ "img",
22198+ // Note we're currently disabling images which we're handling in domToMarkdown (this can be per-site enabled in the config if needed).
2217022199 "script",
2217122200 "style",
2217222201 "link",
@@ -22184,18 +22213,26 @@ ${iframeContent}
2218422213 if (mainContent && mainContent.innerHTML.trim().length <= mainContentLength) {
2218522214 mainContent = null;
2218622215 }
22187- const contentRoot = mainContent || document.body;
22188- if (contentRoot) {
22189- this.log.info("Getting main content", contentRoot );
22190- content + = domToMarkdown(contentRoot , {
22216+ let contentRoot = mainContent || document.body;
22217+ const extractContent = (root) => {
22218+ this.log.info("Getting content", root );
22219+ const result = domToMarkdown(root , {
2219122220 maxLength: upperLimit,
2219222221 maxDepth,
2219322222 includeIframes: this.getFeatureSettingEnabled("includeIframes", "enabled"),
22194- excludeSelectors: excludeSelectorsString
22195- });
22196- this.log.info("Content markdown", content, contentRoot);
22223+ excludeSelectors: excludeSelectorsString,
22224+ trimBlankLinks: this.getFeatureSettingEnabled("trimBlankLinks", "enabled")
22225+ }).trim();
22226+ this.log.info("Content markdown", result, root);
22227+ return result;
22228+ };
22229+ if (contentRoot) {
22230+ content += extractContent(contentRoot);
22231+ }
22232+ if (content.length === 0 && contentRoot !== document.body && this.getFeatureSettingEnabled("bodyFallback", "enabled")) {
22233+ contentRoot = document.body;
22234+ content += extractContent(contentRoot);
2219722235 }
22198- content = content.trim();
2219922236 this.fullContentLength = content.length;
2220022237 if (content.length > maxLength) {
2220122238 this.log.info("Truncating content", {
0 commit comments