
Commit 656f6ca

Feature/Spider (open-source web scraper & crawler) (#2738)
* Add Spider Scraper & Crawler
* fix pnpm lint
* chore: Update metadata to be correct format
* fix pnpm lint
1 parent efc6e02 commit 656f6ca

File tree

4 files changed: +317 −0 lines changed
Lines changed: 25 additions & 0 deletions
import { INodeParams, INodeCredential } from '../src/Interface'

class SpiderApiCredential implements INodeCredential {
    label: string
    name: string
    version: number
    description: string
    inputs: INodeParams[]

    constructor() {
        this.label = 'Spider API'
        this.name = 'spiderApi'
        this.version = 1.0
        this.description = 'Get your API key from the <a target="_blank" href="https://spider.cloud">Spider</a> dashboard.'
        this.inputs = [
            {
                label: 'Spider API Key',
                name: 'spiderApiKey',
                type: 'password'
            }
        ]
    }
}

module.exports = { credClass: SpiderApiCredential }
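Note that `name: 'spiderApi'` is the identifier the document loader below references via `credentialNames: ['spiderApi']`; the two strings must match for this credential to be selectable on the node.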
Lines changed: 175 additions & 0 deletions
import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import SpiderApp from './SpiderApp'

interface SpiderLoaderParameters {
    url: string
    apiKey?: string
    mode?: 'crawl' | 'scrape'
    params?: Record<string, unknown>
}

class SpiderLoader extends BaseDocumentLoader {
    private apiKey: string
    private url: string
    private mode: 'crawl' | 'scrape'
    private params?: Record<string, unknown>

    constructor(loaderParams: SpiderLoaderParameters) {
        super()
        const { apiKey, url, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
        }

        this.apiKey = apiKey
        this.url = url
        this.mode = mode
        this.params = params
    }

    public async load(): Promise<DocumentInterface[]> {
        const app = new SpiderApp({ apiKey: this.apiKey })
        let spiderDocs: any[]

        if (this.mode === 'scrape') {
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
            }
            spiderDocs = [response.data]
        } else if (this.mode === 'crawl') {
            const response = await app.crawlUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
            }
            spiderDocs = response.data
        } else {
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
        }

        // Wrap each Spider result in a LangChain Document, keeping the page URL as the source
        return spiderDocs.map(
            (doc) =>
                new Document({
                    pageContent: doc.content || '',
                    metadata: { source: doc.url }
                })
        )
    }
}

class Spider_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams

    constructor() {
        this.label = 'Spider Document Loaders'
        this.name = 'spiderDocumentLoaders'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'spider.svg'
        this.category = 'Document Loaders'
        this.description = 'Scrape & Crawl the web with Spider'
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Mode',
                name: 'mode',
                type: 'options',
                options: [
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a single page'
                    },
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a website and extract pages within the same domain'
                    }
                ],
                default: 'scrape'
            },
            {
                label: 'Web Page URL',
                name: 'url',
                type: 'string',
                placeholder: 'https://spider.cloud'
            },
            {
                label: 'Additional Parameters',
                name: 'params',
                description:
                    'Find all the available parameters in the <a target="_blank" href="https://spider.cloud/docs/api">Spider API documentation</a>',
                additionalParams: true,
                placeholder: '{ "anti_bot": true }',
                type: 'json',
                optional: true
            }
        ]
        this.credential = {
            label: 'Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['spiderApi']
        }
    }

    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const url = nodeData.inputs?.url as string
        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
        let params = nodeData.inputs?.params || {}
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)

        if (typeof params === 'string') {
            try {
                params = JSON.parse(params)
            } catch (e) {
                throw new Error('Invalid JSON string provided for params')
            }
        }

        // Ensure return_format is set to markdown
        params.return_format = 'markdown'

        const input: SpiderLoaderParameters = {
            url,
            mode: mode as 'crawl' | 'scrape',
            apiKey: spiderApiKey,
            params: params as Record<string, unknown>
        }

        const loader = new SpiderLoader(input)

        let docs = []

        if (textSplitter) {
            docs = await loader.loadAndSplit(textSplitter)
        } else {
            docs = await loader.load()
        }

        return docs
    }
}

module.exports = { nodeClass: Spider_DocumentLoaders }
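For orientation, a minimal sketch of what `init` effectively does at runtime. This is hypothetical standalone usage: `SpiderLoader` is not exported from this module, and `SPIDER_API_KEY` here is an assumed environment variable, not something this diff reads (in Flowise the key comes from the credential store via `getCredentialParam`).

    // Hypothetical standalone usage, assuming SPIDER_API_KEY is set
    const loader = new SpiderLoader({
        url: 'https://spider.cloud',
        apiKey: process.env.SPIDER_API_KEY,
        mode: 'scrape',
        params: { return_format: 'markdown' } // init always forces markdown output
    })
    const docs = await loader.load()
    // docs: Document[] with pageContent (markdown) and metadata.source (the page URL)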
Lines changed: 116 additions & 0 deletions
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'

interface SpiderAppConfig {
    apiKey?: string | null
    apiUrl?: string | null
}

interface SpiderDocumentMetadata {
    title?: string
    description?: string
    language?: string
    [key: string]: any
}

interface SpiderDocument {
    id?: string
    url?: string
    content: string
    markdown?: string
    html?: string
    createdAt?: Date
    updatedAt?: Date
    type?: string
    metadata: SpiderDocumentMetadata
}

interface ScrapeResponse {
    success: boolean
    data?: SpiderDocument
    error?: string
}

interface CrawlResponse {
    success: boolean
    data?: SpiderDocument[]
    error?: string
}

interface Params {
    [key: string]: any
}

class SpiderApp {
    private apiKey: string
    private apiUrl: string

    constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }

    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()
        // A scrape is a crawl of the same endpoint capped at a single page
        const jsonData: Params = { url, limit: 1, ...params }

        try {
            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
            if (response.status === 200) {
                const responseData = response.data
                if (responseData[0].status) {
                    return { success: true, data: responseData[0] }
                } else {
                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
                }
            } else {
                this.handleError(response, 'scrape URL')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse | any> {
        const headers = this.prepareHeaders(idempotencyKey)
        const jsonData: Params = { url, ...params }

        try {
            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
            if (response.status === 200) {
                return { success: true, data: response.data }
            } else {
                this.handleError(response, 'start crawl job')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.post(`${this.apiUrl}/${url}`, data, { headers })
    }

    private handleError(response: AxiosResponse, action: string): void {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage: string = response.data.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        } else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
        }
    }
}

export default SpiderApp
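To see the client in isolation, a minimal sketch under the assumption that a valid key is available in `SPIDER_API_KEY` (an assumed environment variable, not part of this diff):

    // Assumes SPIDER_API_KEY holds a valid Spider API key
    const app = new SpiderApp({ apiKey: process.env.SPIDER_API_KEY })
    // scrapeUrl posts { url, limit: 1 } to the /crawl endpoint
    const result = await app.scrapeUrl('https://spider.cloud', { return_format: 'markdown' })
    if (result.success && result.data) {
        console.log(result.data.content)
    }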
Lines changed: 1 addition & 0 deletions
