Skip to content

Commit 2d75c01

Browse files
committed
feat(document_loaders): add SupadataLoader
1 parent 6f8fa47 commit 2d75c01

File tree

2 files changed

+282
-0
lines changed

2 files changed

+282
-0
lines changed
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import {
2+
test,
3+
expect,
4+
jest,
5+
describe,
6+
beforeEach,
7+
afterAll,
8+
} from "@jest/globals";
9+
import { SupadataLoader } from "../web/supadata.js";
10+
11+
// Stand-ins for the two SDK methods the loader calls. They are declared at
// module scope so individual tests can program their return values.
const mockTranscript = jest.fn();
const mockYoutubeVideo = jest.fn();

// Fake `Supadata` constructor: every instance exposes the shared mocks above.
const mockSupadataConstructor = jest.fn().mockImplementation(() => {
  const fakeClient = {
    transcript: mockTranscript,
    youtube: { video: mockYoutubeVideo },
  };
  return fakeClient;
});

// Replace the real SDK module with one that only exports the fake constructor.
jest.mock("@supadata/js", () => ({
  Supadata: mockSupadataConstructor,
}));
26+
27+
// Reference to the environment object as it was when this file was loaded.
const REAL_ENV = process.env;

/** Gives each test a private copy of the environment and pristine mocks. */
function resetTestState(): void {
  process.env = Object.assign({}, REAL_ENV);
  jest.clearAllMocks();
  for (const sdkMethod of [mockTranscript, mockYoutubeVideo]) {
    sdkMethod.mockReset();
  }
  mockSupadataConstructor.mockClear();
}

/** Puts the original environment object back once the suite has finished. */
function restoreEnvironment(): void {
  process.env = REAL_ENV;
}

beforeEach(resetTestState);

afterAll(restoreEnvironment);
40+
41+
describe("SupadataLoader", () => {
  // Inputs shared by every test in this suite.
  const videoUrl = "https://youtube.com/watch?v=123";
  const apiKey = "test-key";

  test("initializes with API key", async () => {
    mockTranscript.mockResolvedValue({ content: "test", lang: "en" });

    // The SDK client is only constructed once `load()` runs.
    await new SupadataLoader({ urls: [videoUrl], apiKey }).load();

    expect(mockSupadataConstructor).toHaveBeenCalledWith({
      apiKey: "test-key",
    });
  });

  test("fetches transcript successfully", async () => {
    const transcriptResponse = { content: "Hello world", lang: "en" };
    mockTranscript.mockResolvedValue(transcriptResponse);

    const loader = new SupadataLoader({
      urls: [videoUrl],
      apiKey,
      operation: "transcript",
    });
    const docs = await loader.load();

    // The loader must forward the URL and request plain text by default.
    const expectedPayload = expect.objectContaining({
      url: "https://youtube.com/watch?v=123",
      text: true,
    });
    expect(mockTranscript).toHaveBeenCalledWith(expectedPayload);
    expect(docs).toHaveLength(1);
    expect(docs[0].pageContent).toBe("Hello world");
  });

  test("fetches metadata successfully", async () => {
    mockYoutubeVideo.mockResolvedValue({ title: "Awesome Video" });

    const loader = new SupadataLoader({
      urls: [videoUrl],
      apiKey,
      operation: "metadata",
    });
    const docs = await loader.load();

    expect(mockYoutubeVideo).toHaveBeenCalled();
    expect(docs).toHaveLength(1);

    // Metadata is serialized into the page content, so the title must appear.
    const [metadataDoc] = docs;
    expect(metadataDoc.pageContent).toContain("Awesome Video");
    expect(metadataDoc.metadata.supadataOperation).toBe("metadata");
  });
});
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import { Document } from "@langchain/core/documents";
2+
import { getEnvironmentVariable } from "@langchain/core/utils/env";
3+
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
4+
5+
/** The kinds of request the loader can issue for each URL. */
export type SupadataOperation = "metadata" | "transcript";

/** Constructor options accepted by the Supadata document loader. */
export interface SupadataLoaderParams {
  /** One or more URLs to process (YouTube videos, web pages, etc.). */
  urls: string[];

  /**
   * API key for Supadata. When left out, the `SUPADATA_API_KEY`
   * environment variable is consulted instead.
   */
  apiKey?: string;

  /**
   * Which operation to run: "metadata" yields structured information,
   * "transcript" yields textual content. Defaults to "transcript".
   */
  operation?: SupadataOperation;

  /** Language to prefer for transcripts, for example "en". */
  lang?: string;

  /**
   * When true, request a plain-text transcript rather than timestamped
   * chunks. Defaults to true.
   */
  text?: boolean;

  /** How the transcript is obtained: "native", "auto", or "generate". */
  mode?: "native" | "auto" | "generate";

  /** Additional parameters passed straight through to the Supadata SDK. */
  params?: Record<string, unknown>;
}
31+
32+
/**
 * Document loader that wraps the Supadata JavaScript SDK.
 *
 * Supports two operations:
 * - "transcript": fetch a transcript for the given URL
 * - "metadata": fetch metadata for the given URL
 *
 * The Supadata API key is read either from the `apiKey` parameter or from
 * the `SUPADATA_API_KEY` environment variable.
 *
 * URLs are processed one after another. A failure on one URL is logged with
 * `console.warn` and does not stop the remaining URLs from being processed.
 */
export class SupadataLoader extends BaseDocumentLoader {
  private readonly urls: string[];

  private readonly apiKey?: string;

  private readonly operation: SupadataOperation;

  private readonly lang?: string;

  private readonly text: boolean;

  private readonly mode?: "native" | "auto" | "generate";

  private readonly params: Record<string, unknown>;

  /**
   * @param params Loader options; `urls` must contain at least one entry.
   * @throws Error when `urls` is missing or empty.
   */
  constructor(params: SupadataLoaderParams) {
    super();

    if (!params.urls || params.urls.length === 0) {
      throw new Error(
        "SupadataLoader: at least one URL is required in `urls`.",
      );
    }

    // Copy so that later mutation of the caller's array cannot change what
    // this loader processes.
    this.urls = [...params.urls];
    this.apiKey = params.apiKey;
    this.operation = params.operation ?? "transcript";
    this.lang = params.lang;
    this.text = params.text ?? true;
    this.mode = params.mode;
    this.params = params.params ?? {};
  }

  /**
   * Runs the configured operation against every URL and collects the results.
   *
   * @returns One document per URL that loaded successfully, in input order.
   * @throws Error when no API key can be resolved or `@supadata/js` cannot be
   *   imported. Per-URL failures are logged, not thrown.
   */
  async load(): Promise<Document[]> {
    const client = await this.getClient();
    const docs: Document[] = [];

    for (const url of this.urls) {
      try {
        if (this.operation === "metadata") {
          docs.push(await this.loadMetadata(client, url));
        } else if (this.operation === "transcript") {
          docs.push(await this.loadTranscript(client, url));
        } else {
          throw new Error(
            `SupadataLoader: unsupported operation "${this.operation}". Use "metadata" or "transcript".`,
          );
        }
      } catch (e: unknown) {
        // Surface the failure but keep other URLs processing.
        // eslint-disable-next-line no-console
        console.warn(
          `SupadataLoader: failed to load ${url}: ${SupadataLoader.describeError(e)}`,
        );
      }
    }

    return docs;
  }

  /**
   * Produces a printable description of a caught value: its `message`
   * property when one is present, otherwise the value itself as a string.
   */
  private static describeError(e: unknown): string {
    const message = (e as { message?: unknown } | null | undefined)?.message;
    return String(message ?? e);
  }

  /**
   * Decides whether a URL points at YouTube by inspecting its hostname, so
   * that a page which merely mentions "youtube.com" in its path or query is
   * not routed to the YouTube endpoint. Input that cannot be parsed as an
   * absolute URL falls back to a plain substring check.
   */
  private static isYoutubeUrl(url: string): boolean {
    try {
      const host = new URL(url).hostname.toLowerCase();
      return (
        host === "youtu.be" ||
        host === "youtube.com" ||
        host.endsWith(".youtube.com")
      );
    } catch {
      return url.includes("youtube.com") || url.includes("youtu.be");
    }
  }

  /**
   * Returns the explicit API key if one was given, otherwise the value of the
   * `SUPADATA_API_KEY` environment variable.
   *
   * @throws Error when neither source provides a key.
   */
  private resolveApiKey(): string {
    if (this.apiKey) {
      return this.apiKey;
    }

    const envKey = getEnvironmentVariable("SUPADATA_API_KEY");
    if (!envKey) {
      throw new Error(
        "SupadataLoader: Supadata API key not found. Pass `apiKey` to the loader or set the SUPADATA_API_KEY environment variable.",
      );
    }
    return envKey;
  }

  /**
   * Lazily imports `@supadata/js` and constructs a client with the resolved
   * API key.
   *
   * @throws Error when the API key is missing or the package cannot be
   *   imported. Errors raised by the SDK constructor propagate unchanged.
   */
  private async getClient(): Promise<any> {
    const apiKey = this.resolveApiKey();

    // Only the import is guarded: a failure while constructing the client is
    // not an installation problem and must not be reported as one.
    let sdk: typeof import("@supadata/js");
    try {
      sdk = await import("@supadata/js");
    } catch {
      throw new Error(
        "SupadataLoader: failed to load `@supadata/js`. Please install it with `npm install @supadata/js` (or `pnpm add @supadata/js`).",
      );
    }

    return new sdk.Supadata({ apiKey });
  }

  /**
   * Fetches metadata for one URL. YouTube URLs go to `client.youtube.video`
   * when that method exists; everything else goes to `client.web.scrape`.
   *
   * @returns A document whose page content is the pretty-printed JSON result.
   * @throws Error when the client exposes neither method.
   */
  private async loadMetadata(client: any, url: string): Promise<Document> {
    const isYoutube = SupadataLoader.isYoutubeUrl(url);

    let result: unknown;
    if (isYoutube && client.youtube?.video) {
      result = await client.youtube.video({ url, ...this.params });
    } else if (client.web?.scrape) {
      result = await client.web.scrape({ url, ...this.params });
    } else {
      throw new Error(
        "SupadataLoader: could not determine a Supadata SDK method to call for metadata. " +
          "Ensure the SDK version exposes either `youtube.video` or `web.scrape`.",
      );
    }

    return new Document({
      pageContent: JSON.stringify(result, null, 2),
      metadata: {
        source: url,
        supadataOperation: "metadata",
      },
    });
  }

  /**
   * Fetches a transcript for one URL.
   *
   * If the SDK answers with a `jobId` instead of content, a placeholder
   * document carrying that job id is returned. Otherwise the transcript
   * content becomes the page content; non-string content (for example the
   * timestamped chunks requested with `text: false`) is serialized as
   * pretty-printed JSON so that `pageContent` is always a string.
   */
  private async loadTranscript(client: any, url: string): Promise<Document> {
    // `params` is spread last so callers can override the defaults above it;
    // the dedicated `lang` / `mode` options then take precedence over it.
    const payload: Record<string, unknown> = {
      url,
      text: this.text,
      ...this.params,
    };

    if (this.lang) {
      payload.lang = this.lang;
    }
    if (this.mode) {
      payload.mode = this.mode;
    }

    const result = await client.transcript(payload);

    if (result.jobId) {
      return new Document({
        pageContent: `Transcript processing. Job ID: ${result.jobId}`,
        metadata: {
          source: url,
          supadataOperation: "transcript_job",
          jobId: result.jobId,
        },
      });
    }

    const rawContent: unknown = result.content;
    let pageContent: string;
    if (typeof rawContent === "string") {
      pageContent = rawContent;
    } else if (rawContent == null) {
      pageContent = "";
    } else {
      pageContent = JSON.stringify(rawContent, null, 2);
    }

    return new Document({
      pageContent,
      metadata: {
        source: url,
        supadataOperation: "transcript",
        lang: result.lang,
      },
    });
  }
}

0 commit comments

Comments
 (0)