😺node の S3Client から S3, GCS, R2 を操作する2022/07/08に公開4件S3objectstoragetechDiscussionmizchi2024/05/30に更新使うAPIをリファクタして lib.ts に置いた。 r2 を s3 client から叩くコードも追加している。 lib.ts import type { } from "npm:@cloudflare/workers-types@4.20240524.0"; import { S3 } from "npm:@aws-sdk/client-s3@3.583.0"; // import hash from "npm:string-hash@1.1.3"; function checkEnv<K extends string>(keys: K[]): { [key in K]: string } { let valid = true; const results: { [key in K]: string } = {} as any; const missing_keys: K[] = []; for (const key of keys) { const value = Deno.env.get(key); if (!value) { missing_keys.push(key); valid = false; } else { results[key] = value; } } if (!valid) { throw new Error(`Missing environment variables: ${missing_keys.join(', ')}`); } return results as any; } const env = checkEnv([ 'CLOUDFLARE_API_TOKEN', 'CLOUDFLARE_ACCOUNT_ID', 'R2_ACCESS_KEY', 'R2_SECRET_KEY', 'R2_ENDPOINT', ]); const INDEX_NAME = 'embeddings-index'; // R2 const client = new S3({ endpoint: env.R2_ENDPOINT, region: 'auto', credentials: { accessKeyId: env.R2_ACCESS_KEY, secretAccessKey: env.R2_SECRET_KEY, }, }); type Embedding = { shape: number[]; data: number[][]; } type Result<T> = { result: T; success: true; errors: any[]; messages: any[]; } | { result: null; success: false; errors: any[]; messages: any[]; } async function runCfAi(model: string, args: any) { const endpoint = `https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/ai/run/${model}`; return fetch( endpoint, { headers: { 'Authorization': `Bearer ${env.CLOUDFLARE_API_TOKEN}`, 'Content-Type': 'application/json', }, method: "POST", body: JSON.stringify(args), } ).then((res) => res.json()); } export async function getEmbeddingVectors(args: { text: string[] }): Promise<Result<Embedding>> { return await runCfAi('@cf/baai/bge-base-en-v1.5', args) as any; } async function runCfVectorize(method: string, args: any, { ndjson = true }: { ndjson?: boolean, } = {}) { const endpoint = 
`https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/vectorize/indexes/${INDEX_NAME}/${method}`; return fetch( endpoint, { headers: { 'Authorization': `Bearer ${env.CLOUDFLARE_API_TOKEN}`, 'Content-Type': ndjson ? 'application/x-ndjson' : 'application/json', }, method: "POST", body: ndjson ? args.map((arg: any) => JSON.stringify(arg)).join('\n') : JSON.stringify(args), } ).then((res) => res.json()); } export async function upsertVectors(vectors: VectorizeVector[]) { return runCfVectorize('upsert', vectors, { ndjson: true }); } export async function insertVectors(vectors: VectorizeVector[]) { return runCfVectorize('insert', vectors, { ndjson: true }); } export async function queryVectors( vectors: number[], options: VectorizeQueryOptions ): Promise<Result<VectorizeMatches>> { return runCfVectorize('query', { ...options, vector: vectors, }, { ndjson: false }) as any; } export async function queryByText(queryText: string, options: VectorizeQueryOptions) { const query = await getEmbeddingVectors({ text: [queryText] }); if (!query.success) { console.error(query); throw new Error('Failed to get embedding vectors'); } const matches = await queryVectors(query.result.data[0], options); if (!matches.success) { console.error(matches); throw new Error('Failed to query vectors'); } return matches.result.matches; } export async function putObject(bucket: string, url: string, content: string) { return await client.putObject({ Key: url, Body: content, Bucket: bucket, }); } export async function deleteObject(bucket: string, key: string) { return await client.deleteObject({ Key: key, Bucket: bucket }); } export async function getObject(bucket: string, key: string) { const res = await client.getObject({ Key: key, Bucket: bucket, }); return res.Body!.transformToString(); } export async function listObjects(bucket: string) { return await client.listObjects({ Bucket: bucket, }); } これを使って shadcn-ui のドキュメントを vectorize に叩き込んでみる。 import type { } from 
"npm:@cloudflare/workers-types@4.20240524.0"; import { join } from "jsr:@std/path@0.221.0"; import { expandGlob } from "jsr:@std/fs@0.221.0/expand-glob"; import hash from "npm:string-hash@1.1.3"; import { getEmbeddingVectors, putObject, queryByText, getObject, upsertVectors } from "./lib.ts"; // この相対パスの先に shadcn-ui/ui が clone してある。 const base = new URL("../ui/apps/www/content/docs", import.meta.url).pathname; const rawGithubUrl = (path: string) => join(`https://raw.githubusercontent.com/shadcn-ui/ui/main/apps/www/content/docs`, path); async function uploadShadcnUIDocuments(): Promise<undefined> { const docs: Array<{ id: string, url: string, content: string }> = []; for await (const file of expandGlob('**/*.mdx', { includeDirs: false, root: base })) { const relpath = file.path.replace(base, ''); const url = rawGithubUrl(relpath); const content = await Deno.readTextFile(file.path); const id = hash(url).toString(); docs.push({ id, url, content }); } // store documents for (const doc of docs) { console.log("Storing document", doc.id, doc.url); await putObject('test-vectors', doc.id, doc.content); } const res = await getEmbeddingVectors({ text: docs.map(doc => doc.content) }); if (!res.success) { console.error(res); throw new Error('Failed to get embedding vectors'); } const newVectors: VectorizeVector[] = []; for (let i = 0; i < res.result.data.length; i++) { const doc = docs[i]; const vec = res.result.data[i]; newVectors.push({ id: doc.id, values: vec, metadata: { namespace: 'shadcn-ui-docs', url: doc.url } }); } await upsertVectors(newVectors); } await uploadShadcnUIDocuments(); 返信を追加mizchi2024/05/30に更新実際にベクトル検索してみる。 import type { } from "npm:@cloudflare/workers-types@4.20240524.0"; import { getObject, queryByText } from "./lib.ts"; const queryText = Deno.args.join(" "); console.log("Querying for", queryText); const matches = await queryByText(queryText, { topK: 1 }); const result = await getObject('test-vectors', matches[0].id); console.log(result); ボタンについて聞いてみる。 $ 
deno run -A --env build-doc.ts Button Querying for Button --- title: Button description: Displays a button or a component that looks like a button. featured: true component: true --- ... ボタン要素の使い方が説明された。 返信を追加mizchi2024/05/30embedding vector を生成する方法として、cloudflare ではなく openai api のモデルを使ってみる。 ここで vectorize を作り直した。ベクトル長が openai の text-embedding-3-small は 1536 で、text-embedding-3-large がその2倍の 3072。 $ npx wrangler vectorize create embeddings2 --dimensions=1536 --metric=cosine ちなみに 3072 の vectorize は生成できなかった。 import OpenAI from "npm:openai@4.47.2"; import { upsertVectors } from "./lib.ts"; const client = new OpenAI({ apiKey: Deno.env.get("OPENAI_API_KEY")! }); const embeddings = await client.embeddings.create({ model: "text-embedding-3-small", input: 'This is a story about an orange cloud', encoding_format: 'float', }); console.log(embeddings.data[0].embedding); const vector = embeddings.data[0]; const vectors: VectorizeVector = { id: "1", values: vector.embedding, }; const res = await upsertVectors([vectors]); if (!res.success) { console.error(res.errors[0].message); throw new Error('Failed to upsert vectors'); } console.log(res); これで保存できる。 返信を追加mizchi2024/05/30一応ローカルでコサイン類似度を計算できるようにしておく。ローカルと vectorize の実行結果が一致するかも確認する。 import OpenAI from "npm:openai@4.47.2"; import { queryVectors } from "./lib.ts"; const client = new OpenAI({ apiKey: Deno.env.get("OPENAI_API_KEY")! 
}); const embeddings = await client.embeddings.create({ model: "text-embedding-3-small", input: 'This is a story about an orange cloud', encoding_format: 'float', }); console.log(embeddings.data[0].embedding); const queryEmbeddings = await client.embeddings.create({ model: "text-embedding-3-small", input: 'orange cloud', encoding_format: 'float', }); const similarity = cosineSimilarity(embeddings.data[0].embedding, queryEmbeddings.data[0].embedding); console.log(similarity); const x = await queryVectors(queryEmbeddings.data[0].embedding, { topK: 1 }); console.log('calc on cloud', x); function cosineSimilarity(vecA: number[], vecB: number[]): number { if (vecA.length !== vecB.length) { throw new Error('Vectors must be of the same length'); } let dotProduct = 0; let normA = 0; let normB = 0; for (let i = 0; i < vecA.length; i++) { dotProduct += vecA[i] * vecB[i]; normA += vecA[i] * vecA[i]; normB += vecB[i] * vecB[i]; } if (normA === 0 || normB === 0) { throw new Error('Norm of a vector is zero, cannot calculate cosine similarity'); } return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); } Result 0.5484223977832671 calc on cloud { result: { count: 1, matches: [ { id: "1", score: 0.548422398 } ] }, result_info: null, success: true, errors: [], messages: [] } 返信を追加
mizchi 2024/05/30に更新

使うAPIをリファクタして lib.ts に置いた。
r2 を s3 client から叩くコードも追加している。

lib.ts

```ts
import type { } from "npm:@cloudflare/workers-types@4.20240524.0";
import { S3 } from "npm:@aws-sdk/client-s3@3.583.0";
// import hash from "npm:string-hash@1.1.3";

function checkEnv<K extends string>(keys: K[]): { [key in K]: string } {
  let valid = true;
  const results: { [key in K]: string } = {} as any;
  const missing_keys: K[] = [];
  for (const key of keys) {
    const value = Deno.env.get(key);
    if (!value) {
      missing_keys.push(key);
      valid = false;
    } else {
      results[key] = value;
    }
  }
  if (!valid) {
    throw new Error(`Missing environment variables: ${missing_keys.join(', ')}`);
  }
  return results as any;
}

const env = checkEnv([
  'CLOUDFLARE_API_TOKEN',
  'CLOUDFLARE_ACCOUNT_ID',
  'R2_ACCESS_KEY',
  'R2_SECRET_KEY',
  'R2_ENDPOINT',
]);

const INDEX_NAME = 'embeddings-index';

// R2
const client = new S3({
  endpoint: env.R2_ENDPOINT,
  region: 'auto',
  credentials: {
    accessKeyId: env.R2_ACCESS_KEY,
    secretAccessKey: env.R2_SECRET_KEY,
  },
});

type Embedding = {
  shape: number[];
  data: number[][];
}

type Result<T> = {
  result: T;
  success: true;
  errors: any[];
  messages: any[];
} | {
  result: null;
  success: false;
  errors: any[];
  messages: any[];
}

async function runCfAi(model: string, args: any) {
  const endpoint = `https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/ai/run/${model}`;
  return fetch(
    endpoint,
    {
      headers: {
        'Authorization': `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
        'Content-Type': 'application/json',
      },
      method: "POST",
      body: JSON.stringify(args),
    }
  ).then((res) => res.json());
}

export async function getEmbeddingVectors(args: { text: string[] }): Promise<Result<Embedding>> {
  return await runCfAi('@cf/baai/bge-base-en-v1.5', args) as any;
}

async function runCfVectorize(method: string, args: any, { ndjson = true }: { ndjson?: boolean, } = {}) {
  const endpoint = `https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/vectorize/indexes/${INDEX_NAME}/${method}`;
  return fetch(
    endpoint,
    {
      headers: {
        'Authorization': `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
        'Content-Type': ndjson ? 'application/x-ndjson' : 'application/json',
      },
      method: "POST",
      body: ndjson ? args.map((arg: any) => JSON.stringify(arg)).join('\n') : JSON.stringify(args),
    }
  ).then((res) => res.json());
}

export async function upsertVectors(vectors: VectorizeVector[]) {
  return runCfVectorize('upsert', vectors, { ndjson: true });
}

export async function insertVectors(vectors: VectorizeVector[]) {
  return runCfVectorize('insert', vectors, { ndjson: true });
}

export async function queryVectors(
  vectors: number[],
  options: VectorizeQueryOptions
): Promise<Result<VectorizeMatches>> {
  return runCfVectorize('query', {
    ...options,
    vector: vectors,
  }, { ndjson: false }) as any;
}

export async function queryByText(queryText: string, options: VectorizeQueryOptions) {
  const query = await getEmbeddingVectors({ text: [queryText] });
  if (!query.success) {
    console.error(query);
    throw new Error('Failed to get embedding vectors');
  }
  const matches = await queryVectors(query.result.data[0], options);
  if (!matches.success) {
    console.error(matches);
    throw new Error('Failed to query vectors');
  }
  return matches.result.matches;
}

export async function putObject(bucket: string, url: string, content: string) {
  return await client.putObject({
    Key: url,
    Body: content,
    Bucket: bucket,
  });
}

export async function deleteObject(bucket: string, key: string) {
  return await client.deleteObject({ Key: key, Bucket: bucket });
}

export async function getObject(bucket: string, key: string) {
  const res = await client.getObject({
    Key: key,
    Bucket: bucket,
  });
  return res.Body!.transformToString();
}

export async function listObjects(bucket: string) {
  return await client.listObjects({
    Bucket: bucket,
  });
}
```

これを使って shadcn-ui のドキュメントを vectorize に叩き込んでみる。

```ts
import type { } from "npm:@cloudflare/workers-types@4.20240524.0";
import { join } from "jsr:@std/path@0.221.0";
import { expandGlob } from "jsr:@std/fs@0.221.0/expand-glob";
import hash from "npm:string-hash@1.1.3";
import { getEmbeddingVectors, putObject, queryByText, getObject, upsertVectors } from "./lib.ts";

// この相対パスの先に shadcn-ui/ui が clone してある。
const base = new URL("../ui/apps/www/content/docs", import.meta.url).pathname;
const rawGithubUrl = (path: string) => join(`https://raw.githubusercontent.com/shadcn-ui/ui/main/apps/www/content/docs`, path);

async function uploadShadcnUIDocuments(): Promise<undefined> {
  const docs: Array<{ id: string, url: string, content: string }> = [];
  for await (const file of expandGlob('**/*.mdx', { includeDirs: false, root: base })) {
    const relpath = file.path.replace(base, '');
    const url = rawGithubUrl(relpath);
    const content = await Deno.readTextFile(file.path);
    const id = hash(url).toString();
    docs.push({ id, url, content });
  }

  // store documents
  for (const doc of docs) {
    console.log("Storing document", doc.id, doc.url);
    await putObject('test-vectors', doc.id, doc.content);
  }

  const res = await getEmbeddingVectors({ text: docs.map(doc => doc.content) });
  if (!res.success) {
    console.error(res);
    throw new Error('Failed to get embedding vectors');
  }

  const newVectors: VectorizeVector[] = [];
  for (let i = 0; i < res.result.data.length; i++) {
    const doc = docs[i];
    const vec = res.result.data[i];
    newVectors.push({ id: doc.id, values: vec, metadata: { namespace: 'shadcn-ui-docs', url: doc.url } });
  }
  await upsertVectors(newVectors);
}

await uploadShadcnUIDocuments();
```

返信を追加
mizchi 2024/05/30に更新

実際にベクトル検索してみる。

```ts
import type { } from "npm:@cloudflare/workers-types@4.20240524.0";
import { getObject, queryByText } from "./lib.ts";

const queryText = Deno.args.join(" ");
console.log("Querying for", queryText);
const matches = await queryByText(queryText, { topK: 1 });
const result = await getObject('test-vectors', matches[0].id);
console.log(result);
```

ボタンについて聞いてみる。

```
$ deno run -A --env build-doc.ts Button
Querying for Button
---
title: Button
description: Displays a button or a component that looks like a button.
featured: true
component: true
---
...
```

ボタン要素の使い方が説明された。

返信を追加
mizchi 2024/05/30

embedding vector を生成する方法として、cloudflare ではなく openai api のモデルを使ってみる。

ここで vectorize を作り直した。ベクトル長が openai の text-embedding-3-small は 1536 で、text-embedding-3-large がその2倍の 3072。

```
$ npx wrangler vectorize create embeddings2 --dimensions=1536 --metric=cosine
```

ちなみに 3072 の vectorize は生成できなかった。

```ts
import OpenAI from "npm:openai@4.47.2";
import { upsertVectors } from "./lib.ts";

const client = new OpenAI({
  apiKey: Deno.env.get("OPENAI_API_KEY")!
});

const embeddings = await client.embeddings.create({
  model: "text-embedding-3-small",
  input: 'This is a story about an orange cloud',
  encoding_format: 'float',
});
console.log(embeddings.data[0].embedding);

const vector = embeddings.data[0];
const vectors: VectorizeVector = {
  id: "1",
  values: vector.embedding,
};

const res = await upsertVectors([vectors]);
if (!res.success) {
  console.error(res.errors[0].message);
  throw new Error('Failed to upsert vectors');
}
console.log(res);
```

これで保存できる。

返信を追加
mizchi 2024/05/30

一応ローカルでコサイン類似度を計算できるようにしておく。ローカルと vectorize の実行結果が一致するかも確認する。

```ts
import OpenAI from "npm:openai@4.47.2";
import { queryVectors } from "./lib.ts";

const client = new OpenAI({
  apiKey: Deno.env.get("OPENAI_API_KEY")!
});

const embeddings = await client.embeddings.create({
  model: "text-embedding-3-small",
  input: 'This is a story about an orange cloud',
  encoding_format: 'float',
});
console.log(embeddings.data[0].embedding);

const queryEmbeddings = await client.embeddings.create({
  model: "text-embedding-3-small",
  input: 'orange cloud',
  encoding_format: 'float',
});

const similarity = cosineSimilarity(embeddings.data[0].embedding, queryEmbeddings.data[0].embedding);
console.log(similarity);

const x = await queryVectors(queryEmbeddings.data[0].embedding, { topK: 1 });
console.log('calc on cloud', x);

function cosineSimilarity(vecA: number[], vecB: number[]): number {
  if (vecA.length !== vecB.length) {
    throw new Error('Vectors must be of the same length');
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }

  if (normA === 0 || normB === 0) {
    throw new Error('Norm of a vector is zero, cannot calculate cosine similarity');
  }

  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
```

Result

```
0.5484223977832671
calc on cloud {
  result: { count: 1, matches: [ { id: "1", score: 0.548422398 } ] },
  result_info: null,
  success: true,
  errors: [],
  messages: []
}
```

返信を追加
Discussion
使うAPIをリファクタして lib.ts に置いた。
r2 を s3 client から叩くコードも追加している。
lib.ts
これを使って shadcn-ui のドキュメントを vectorize に叩き込んでみる。
実際にベクトル検索してみる。
ボタンについて聞いてみる。
ボタン要素の使い方が説明された。
embedding vector を生成する方法として、cloudflare ではなく openai api のモデルを使ってみる。
ここで vectorize を作り直した。ベクトル長が openai の text-embedding-3-small は 1536 で、text-embedding-3-large がその2倍の 3072。
ちなみに 3072 の vectorize は生成できなかった。
これで保存できる。
一応ローカルでコサイン類似度を計算できるようにしておく。ローカルと vectorize の実行結果が一致するかも確認する。
Result