Skip to content

Commit bf73113

Browse files
community[minor]: updated Browserbase loader (#5412)
* updated browserbase loader * ran format * Move to community * Format * Loosen peer dep --------- Co-authored-by: Mish Ushakov <mishushakov@users.noreply.github.com> Co-authored-by: jacoblee93 <jacoblee93@gmail.com>
1 parent eadf62e commit bf73113

File tree

10 files changed

+140
-16
lines changed

10 files changed

+140
-16
lines changed

‎docs/core_docs/docs/integrations/document_loaders/web_loaders/browserbase.mdx‎

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@
22

33
## Description
44

5-
[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.
5+
[Browserbase](https://browserbase.com) is a developer platform to reliably run, manage, and monitor headless browsers.
6+
7+
Power your AI data retrievals with:
8+
9+
- [Serverless Infrastructure](https://docs.browserbase.com/under-the-hood) providing reliable browsers to extract data from complex UIs
10+
- [Stealth Mode](https://docs.browserbase.com/features/stealth-mode) with included fingerprinting tactics and automatic captcha solving
11+
- [Session Debugger](https://docs.browserbase.com/features/sessions) to inspect your Browser Session with network timeline and logs
12+
- [Live Debug](https://docs.browserbase.com/guides/session-debug-connection/browser-remote-control) to quickly debug your automation
613

714
## Installation
815

9-
- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).
16+
- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`).
1017
- Install the [Browserbase SDK](http://github.com/browserbase/js-sdk):
1118

1219
```bash npm2yarn
@@ -28,5 +35,6 @@ import Example from "@examples/document_loaders/browserbase.ts";
2835

2936
## Options
3037

31-
- `api_key`: Optional. Specifies Browserbase API key. Defaults is the `BROWSERBASE_API_KEY` environment variable.
32-
- `text_content`: Optional. Load pages as readable text. Default is `False`.
38+
- `textContent`: Optional. Retrieve only text content. Default is `false`.
39+
- `sessionId`: Optional. Provide an existing Session ID.
40+
- `proxy`: Optional. Enable/Disable Proxies.

‎examples/src/document_loaders/browserbase.ts‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { BrowserbaseLoader } from "langchain/document_loaders/web/browserbase";
1+
import { BrowserbaseLoader } from "@langchain/community/document_loaders/web/browserbase";
22

33
const loader = new BrowserbaseLoader(["https://example.com"], {
44
textContent: true,

‎langchain/package.json‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@
594594
"@aws-sdk/credential-provider-node": "^3.388.0",
595595
"@aws-sdk/types": "^3.357.0",
596596
"@azure/storage-blob": "^12.15.0",
597-
"@browserbasehq/sdk": "^1.0.0",
597+
"@browserbasehq/sdk": "^1.1.5",
598598
"@cloudflare/workers-types": "^4.20230922.0",
599599
"@faker-js/faker": "^7.6.0",
600600
"@gomomento/sdk": "^1.51.1",

‎langchain/src/document_loaders/web/browserbase.ts‎

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
import { Document, type DocumentInterface } from "@langchain/core/documents";
2-
import Browserbase, { BrowserbaseLoadOptions } from "@browserbasehq/sdk";
2+
import Browserbase, { LoadOptions, ClientOptions } from "@browserbasehq/sdk";
33
import { BaseDocumentLoader } from "../base.js";
44
import type { DocumentLoader } from "../base.js";
5+
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
56

6-
interface BrowserbaseLoaderOptions extends BrowserbaseLoadOptions {
7-
apiKey?: string;
8-
}
7+
/* #__PURE__ */ logVersion020MigrationWarning({
8+
oldEntrypointName: "document_loaders/web/browserbase",
9+
newPackageName: "@langchain/community",
10+
});
11+
12+
type BrowserbaseLoaderOptions = ClientOptions & LoadOptions;
913

1014
/**
15+
* @deprecated Import from "@langchain/community/document_loaders/web/browserbase" instead. This entrypoint will be removed in 0.3.0.
1116
* Load pre-rendered web pages using a headless browser hosted on Browserbase.
1217
*
1318
* Depends on `@browserbasehq/sdk` package.
@@ -42,7 +47,7 @@ export class BrowserbaseLoader
4247
super();
4348
this.urls = urls;
4449
this.options = options;
45-
this.browserbase = new Browserbase(options.apiKey);
50+
this.browserbase = new Browserbase(options);
4651
}
4752

4853
/**

‎libs/langchain-community/.gitignore‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,10 @@ document_loaders/web/azure_blob_storage_file.cjs
794794
document_loaders/web/azure_blob_storage_file.js
795795
document_loaders/web/azure_blob_storage_file.d.ts
796796
document_loaders/web/azure_blob_storage_file.d.cts
797+
document_loaders/web/browserbase.cjs
798+
document_loaders/web/browserbase.js
799+
document_loaders/web/browserbase.d.ts
800+
document_loaders/web/browserbase.d.cts
797801
document_loaders/web/cheerio.cjs
798802
document_loaders/web/cheerio.js
799803
document_loaders/web/cheerio.d.ts

‎libs/langchain-community/langchain.config.js‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ export const config = {
248248
"document_loaders/web/azure_blob_storage_container",
249249
"document_loaders/web/azure_blob_storage_file":
250250
"document_loaders/web/azure_blob_storage_file",
251+
"document_loaders/web/browserbase": "document_loaders/web/browserbase",
251252
"document_loaders/web/cheerio": "document_loaders/web/cheerio",
252253
"document_loaders/web/puppeteer": "document_loaders/web/puppeteer",
253254
"document_loaders/web/playwright": "document_loaders/web/playwright",
@@ -447,6 +448,7 @@ export const config = {
447448
"document_loaders/web/assemblyai",
448449
"document_loaders/web/azure_blob_storage_container",
449450
"document_loaders/web/azure_blob_storage_file",
451+
"document_loaders/web/browserbase",
450452
"document_loaders/web/cheerio",
451453
"document_loaders/web/puppeteer",
452454
"document_loaders/web/playwright",

‎libs/langchain-community/package.json‎

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
"@aws-sdk/types": "^3.357.0",
6666
"@azure/search-documents": "^12.0.0",
6767
"@azure/storage-blob": "^12.15.0",
68+
"@browserbasehq/sdk": "^1.1.5",
6869
"@clickhouse/client": "^0.2.5",
6970
"@cloudflare/ai": "^1.0.12",
7071
"@cloudflare/workers-types": "^4.20230922.0",
@@ -221,6 +222,7 @@
221222
"@aws-sdk/credential-provider-node": "^3.388.0",
222223
"@azure/search-documents": "^12.0.0",
223224
"@azure/storage-blob": "^12.15.0",
225+
"@browserbasehq/sdk": "*",
224226
"@clickhouse/client": "^0.2.5",
225227
"@cloudflare/ai": "*",
226228
"@datastax/astra-db-ts": "^1.0.0",
@@ -361,6 +363,9 @@
361363
"@azure/storage-blob": {
362364
"optional": true
363365
},
366+
"@browserbasehq/sdk": {
367+
"optional": true
368+
},
364369
"@clickhouse/client": {
365370
"optional": true
366371
},
@@ -2463,6 +2468,15 @@
24632468
"import": "./document_loaders/web/azure_blob_storage_file.js",
24642469
"require": "./document_loaders/web/azure_blob_storage_file.cjs"
24652470
},
2471+
"./document_loaders/web/browserbase": {
2472+
"types": {
2473+
"import": "./document_loaders/web/browserbase.d.ts",
2474+
"require": "./document_loaders/web/browserbase.d.cts",
2475+
"default": "./document_loaders/web/browserbase.d.ts"
2476+
},
2477+
"import": "./document_loaders/web/browserbase.js",
2478+
"require": "./document_loaders/web/browserbase.cjs"
2479+
},
24662480
"./document_loaders/web/cheerio": {
24672481
"types": {
24682482
"import": "./document_loaders/web/cheerio.d.ts",
@@ -3632,6 +3646,10 @@
36323646
"document_loaders/web/azure_blob_storage_file.js",
36333647
"document_loaders/web/azure_blob_storage_file.d.ts",
36343648
"document_loaders/web/azure_blob_storage_file.d.cts",
3649+
"document_loaders/web/browserbase.cjs",
3650+
"document_loaders/web/browserbase.js",
3651+
"document_loaders/web/browserbase.d.ts",
3652+
"document_loaders/web/browserbase.d.cts",
36353653
"document_loaders/web/cheerio.cjs",
36363654
"document_loaders/web/cheerio.js",
36373655
"document_loaders/web/cheerio.d.ts",
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import { Document, type DocumentInterface } from "@langchain/core/documents";
2+
import {
3+
BaseDocumentLoader,
4+
type DocumentLoader,
5+
} from "@langchain/core/document_loaders/base";
6+
import Browserbase, { LoadOptions, ClientOptions } from "@browserbasehq/sdk";
7+
8+
// Options accepted by BrowserbaseLoader: the SDK's client options merged with
// its load options. The same object is handed to both the Browserbase client
// constructor and to `loadURLs`.
type BrowserbaseLoaderOptions = ClientOptions & LoadOptions;
9+
10+
/**
 * Load pre-rendered web pages using a headless browser hosted on Browserbase.
 *
 * Depends on `@browserbasehq/sdk` package.
 * Get your API key from https://browserbase.com
 *
 * @example
 * ```typescript
 * import { BrowserbaseLoader } from "@langchain/community/document_loaders/web/browserbase";
 *
 * const loader = new BrowserbaseLoader(["https://example.com"], {
 *   apiKey: process.env.BROWSERBASE_API_KEY,
 *   textContent: true,
 * });
 *
 * const docs = await loader.load();
 * ```
 *
 * @param urls - The URLs of the web pages to load.
 * @param options - Browserbase client and load options. The same object is
 *   passed to the Browserbase client constructor and to every load call.
 */
export class BrowserbaseLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  /** URLs to load, in the order their documents are emitted. */
  urls: string[];

  /** Options forwarded to the Browserbase client and to `loadURLs`. */
  options: BrowserbaseLoaderOptions;

  /** Browserbase SDK client used to fetch the pages. */
  browserbase: Browserbase;

  constructor(urls: string[], options: BrowserbaseLoaderOptions = {}) {
    super();
    this.urls = urls;
    this.options = options;
    this.browserbase = new Browserbase(options);
  }

  /**
   * Load pages from URLs.
   *
   * Drains {@link lazyLoad} and collects every yielded document.
   *
   * @returns A promise which resolves to a list of documents.
   */
  async load(): Promise<DocumentInterface[]> {
    const documents: DocumentInterface[] = [];
    for await (const doc of this.lazyLoad()) {
      documents.push(doc);
    }

    return documents;
  }

  /**
   * Load pages from URLs lazily.
   *
   * @returns An async generator that yields one document per loaded page,
   *   with the page content as `pageContent` and the source URL under
   *   `metadata.url`.
   */
  async *lazyLoad() {
    const pages = await this.browserbase.loadURLs(this.urls, this.options);

    // Pages are matched to URLs by position.
    // NOTE(review): assumes `loadURLs` yields pages in the same order as
    // `this.urls` — confirm against the SDK.
    let index = 0;
    for await (const page of pages) {
      yield new Document({
        pageContent: page,
        metadata: {
          url: this.urls[index],
        },
      });

      // Advance by exactly one. The previous `index += index + 1` produced
      // 0, 1, 3, 7, ... and mislabelled every page from the third onward.
      index += 1;
    }
  }
}

‎libs/langchain-community/src/load/import_constants.ts‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ export const optionalImportEntrypoints: string[] = [
134134
"langchain_community/document_loaders/web/assemblyai",
135135
"langchain_community/document_loaders/web/azure_blob_storage_container",
136136
"langchain_community/document_loaders/web/azure_blob_storage_file",
137+
"langchain_community/document_loaders/web/browserbase",
137138
"langchain_community/document_loaders/web/cheerio",
138139
"langchain_community/document_loaders/web/puppeteer",
139140
"langchain_community/document_loaders/web/playwright",

‎yarn.lock‎

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6623,13 +6623,13 @@ __metadata:
66236623
languageName: node
66246624
linkType: hard
66256625

6626-
"@browserbasehq/sdk@npm:^1.0.0":
6627-
version: 1.0.0
6628-
resolution: "@browserbasehq/sdk@npm:1.0.0"
6626+
"@browserbasehq/sdk@npm:^1.1.5":
6627+
version: 1.1.5
6628+
resolution: "@browserbasehq/sdk@npm:1.1.5"
66296629
dependencies:
66306630
playwright: ^1.43.1
66316631
zod: ^3.22.5
6632-
checksum: 1aa7d6fd9e7550bdb7fff43a3c858227bcb9fcb26e9c4ee0ae245e36a1fa90d9a378f6937c620ef0bbe8e400710005778d945d180033639afa50a564c975a3ae
6632+
checksum: 9b62d6471c4f706af881b58b8fcc0c06ea48bc4b3b2d97c71265a8294cf38afefc61609b21c7d6564a312a91359f645e90462875582c73004420ed2a2bf4e1bc
66336633
languageName: node
66346634
linkType: hard
66356635

@@ -9016,6 +9016,7 @@ __metadata:
90169016
"@aws-sdk/types": ^3.357.0
90179017
"@azure/search-documents": ^12.0.0
90189018
"@azure/storage-blob": ^12.15.0
9019+
"@browserbasehq/sdk": ^1.1.5
90199020
"@clickhouse/client": ^0.2.5
90209021
"@cloudflare/ai": ^1.0.12
90219022
"@cloudflare/workers-types": ^4.20230922.0
@@ -9182,6 +9183,7 @@ __metadata:
91829183
"@aws-sdk/credential-provider-node": ^3.388.0
91839184
"@azure/search-documents": ^12.0.0
91849185
"@azure/storage-blob": ^12.15.0
9186+
"@browserbasehq/sdk": "*"
91859187
"@clickhouse/client": ^0.2.5
91869188
"@cloudflare/ai": "*"
91879189
"@datastax/astra-db-ts": ^1.0.0
@@ -9309,6 +9311,8 @@ __metadata:
93099311
optional: true
93109312
"@azure/storage-blob":
93119313
optional: true
9314+
"@browserbasehq/sdk":
9315+
optional: true
93129316
"@clickhouse/client":
93139317
optional: true
93149318
"@cloudflare/ai":
@@ -27375,7 +27379,7 @@ __metadata:
2737527379
"@aws-sdk/credential-provider-node": ^3.388.0
2737627380
"@aws-sdk/types": ^3.357.0
2737727381
"@azure/storage-blob": ^12.15.0
27378-
"@browserbasehq/sdk": ^1.0.0
27382+
"@browserbasehq/sdk": ^1.1.5
2737927383
"@cloudflare/workers-types": ^4.20230922.0
2738027384
"@faker-js/faker": ^7.6.0
2738127385
"@gomomento/sdk": ^1.51.1

0 commit comments

Comments
 (0)