😺

node の S3Client から S3, GCS, R2 を操作する

に公開4

Discussion

mizchimizchi

使うAPIをリファクタして lib.ts に置いた。
r2 を s3 client から叩くコードも追加している。

lib.ts

import type { } from "npm:@cloudflare/workers-types@4.20240524.0";
import { S3 } from "npm:@aws-sdk/client-s3@3.583.0";
// import hash from "npm:string-hash@1.1.3";

/**
 * Reads every requested environment variable via `Deno.env.get`.
 *
 * @param keys Names of the environment variables to read.
 * @returns An object mapping each requested name to its non-empty value.
 * @throws Error listing every name whose value is unset or empty.
 */
function checkEnv<K extends string>(keys: K[]): { [key in K]: string } {
  const resolved = {} as { [key in K]: string };
  const absent: K[] = [];

  keys.forEach((name) => {
    const raw = Deno.env.get(name);
    if (raw) {
      resolved[name] = raw;
      return;
    }
    // Unset and empty-string values are both reported as missing.
    absent.push(name);
  });

  if (absent.length > 0) {
    throw new Error(`Missing environment variables: ${absent.join(', ')}`);
  }
  return resolved;
}

// Resolve all required credentials at module load; checkEnv throws if any is missing,
// so importing this module fails early rather than at the first API call.
const env = checkEnv([
  'CLOUDFLARE_API_TOKEN',
  'CLOUDFLARE_ACCOUNT_ID',
  'R2_ACCESS_KEY',
  'R2_SECRET_KEY',
  'R2_ENDPOINT',
]);

// Name of the Vectorize index that every runCfVectorize request targets.
const INDEX_NAME = 'embeddings-index';
// S3 client pointed at the R2 endpoint and credentials taken from the environment.
const client = new S3({
  endpoint: env.R2_ENDPOINT,
  // NOTE(review): 'auto' is presumably the region value R2 expects — confirm against the R2 docs.
  region: 'auto',
  credentials: {
    accessKeyId: env.R2_ACCESS_KEY,
    secretAccessKey: env.R2_SECRET_KEY,
  },
});

/**
 * Payload of a successful embedding response (see getEmbeddingVectors).
 * Callers in this file index `data` by the position of the input text,
 * e.g. `data[0]` for a single-text request.
 */
type Embedding = {
  // NOTE(review): presumably the dimensions of `data` — confirm against the API docs.
  shape: number[];
  // One numeric vector per entry.
  data: number[][];
}

/**
 * Response envelope used for the Cloudflare API calls in this file.
 * It is a union discriminated on `success`: `result` is typed as `T` when
 * `success` is true and as `null` when it is false, so callers must check
 * `success` before reading `result`.
 */
type Result<T> = {
  result: T;
  success: true;
  errors: any[];
  messages: any[];
} | {
  result: null;
  success: false;
  errors: any[];
  messages: any[];
}

/**
 * POSTs `args` as JSON to the Cloudflare `ai/run/<model>` endpoint of the
 * configured account and resolves with the parsed JSON response body.
 *
 * @param model Model identifier appended to the endpoint path.
 * @param args  Request payload; serialized with JSON.stringify.
 */
async function runCfAi(model: string, args: any) {
  const url = `https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/ai/run/${model}`;
  const headers = {
    'Authorization': `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
    'Content-Type': 'application/json',
  };
  const payload = JSON.stringify(args);

  const response = await fetch(url, { headers, method: "POST", body: payload });
  return response.json();
}

/**
 * Requests embedding vectors for the given texts from the
 * `@cf/baai/bge-base-en-v1.5` model via runCfAi.
 *
 * @param args Object whose `text` array holds the inputs to embed.
 * @returns The response envelope; the JSON is not validated, only typed.
 */
export async function getEmbeddingVectors(args: { text: string[] }): Promise<Result<Embedding>> {
  const response: any = await runCfAi('@cf/baai/bge-base-en-v1.5', args);
  return response;
}

/**
 * POSTs a request to `vectorize/indexes/<INDEX_NAME>/<method>` for the
 * configured account and resolves with the parsed JSON response body.
 *
 * @param method Final path segment of the endpoint (e.g. 'upsert', 'query').
 * @param args   Payload. With `ndjson` enabled it must be an array; each
 *               element is serialized onto its own line.
 * @param options.ndjson When true (the default) the body is sent as
 *               newline-delimited JSON, otherwise as a single JSON document.
 */
async function runCfVectorize(method: string, args: any, { ndjson = true }: {
  ndjson?: boolean,
} = {}) {
  const url = `https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/vectorize/indexes/${INDEX_NAME}/${method}`;

  let contentType: string;
  let payload: string;
  if (ndjson) {
    contentType = 'application/x-ndjson';
    payload = args.map((arg: any) => JSON.stringify(arg)).join('\n');
  } else {
    contentType = 'application/json';
    payload = JSON.stringify(args);
  }

  const response = await fetch(url, {
    headers: {
      'Authorization': `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
      'Content-Type': contentType,
    },
    method: "POST",
    body: payload,
  });
  return response.json();
}

/**
 * Sends the vectors to the index's `upsert` endpoint as NDJSON and
 * resolves with the parsed response.
 */
export async function upsertVectors(vectors: VectorizeVector[]) {
  const response = await runCfVectorize('upsert', vectors, { ndjson: true });
  return response;
}

/**
 * Sends the vectors to the index's `insert` endpoint as NDJSON and
 * resolves with the parsed response.
 */
export async function insertVectors(vectors: VectorizeVector[]) {
  const response = await runCfVectorize('insert', vectors, { ndjson: true });
  return response;
}

/**
 * Queries the index with a single query vector.
 *
 * @param vectors The query vector (one vector, despite the plural name).
 * @param options Query options; spread into the request body next to `vector`.
 * @returns The response envelope; the JSON is not validated, only typed.
 */
export async function queryVectors(
  vectors: number[],
  options: VectorizeQueryOptions
): Promise<Result<VectorizeMatches>> {
  const payload = { ...options, vector: vectors };
  const response: any = await runCfVectorize('query', payload, { ndjson: false });
  return response;
}

/**
 * Embeds `queryText` and queries the index with the resulting vector.
 *
 * @param queryText Text to search for.
 * @param options   Query options forwarded to queryVectors.
 * @returns The `matches` array of the query result.
 * @throws Error if either the embedding request or the query reports
 *         `success: false`; the failing response is logged first.
 */
export async function queryByText(queryText: string, options: VectorizeQueryOptions) {
  const embedded = await getEmbeddingVectors({ text: [queryText] });
  if (embedded.success) {
    // A single text was sent, so the first vector is the query vector.
    const found = await queryVectors(embedded.result.data[0], options);
    if (found.success) {
      return found.result.matches;
    }
    console.error(found);
    throw new Error('Failed to query vectors');
  }

  console.error(embedded);
  throw new Error('Failed to get embedding vectors');
}


/**
 * Stores `content` in `bucket` under the object key given by `url`.
 *
 * @param bucket  Target bucket name.
 * @param url     Used verbatim as the object key.
 * @param content Object body.
 */
export async function putObject(bucket: string, url: string, content: string) {
  const request = { Bucket: bucket, Key: url, Body: content };
  return await client.putObject(request);
}

/**
 * Deletes the object stored under `key` in `bucket`.
 */
export async function deleteObject(bucket: string, key: string) {
  const request = { Bucket: bucket, Key: key };
  return await client.deleteObject(request);
}

/**
 * Fetches the object stored under `key` in `bucket` and returns its body
 * decoded as a string.
 *
 * @param bucket Bucket name.
 * @param key    Object key.
 * @returns The object body as text.
 * @throws Error if the response carries no body. `Body` is optional in the
 *         response type, so it is checked explicitly instead of asserted
 *         with `!`, which would surface as an opaque TypeError.
 */
export async function getObject(bucket: string, key: string) {
  const res = await client.getObject({
    Key: key,
    Bucket: bucket,
  });
  if (!res.Body) {
    throw new Error(`Object "${key}" in bucket "${bucket}" has no body`);
  }
  return res.Body.transformToString();
}

/**
 * Issues a single `listObjects` request for `bucket` and returns the raw
 * response; no follow-up requests are made here.
 */
export async function listObjects(bucket: string) {
  const request = { Bucket: bucket };
  return await client.listObjects(request);
}

これを使って shadcn-ui のドキュメントを vectorize に叩き込んでみる。

import type { } from "npm:@cloudflare/workers-types@4.20240524.0";
import { join } from "jsr:@std/path@0.221.0";
import { expandGlob } from "jsr:@std/fs@0.221.0/expand-glob";
import hash from "npm:string-hash@1.1.3";
import { getEmbeddingVectors, putObject, queryByText, getObject, upsertVectors } from "./lib.ts";

// The shadcn-ui/ui repository is cloned at the location this relative path points to.
// `base` is the filesystem path of its docs directory, resolved relative to this script.
const base = new URL("../ui/apps/www/content/docs", import.meta.url).pathname;
/**
 * Builds the raw.githubusercontent.com URL of a docs file.
 *
 * The URL is assembled with plain string concatenation: a filesystem-path
 * `join` normalizes repeated slashes and would turn `https://` into
 * `https:/`, yielding a malformed URL. Note that document ids hashed from
 * this URL therefore differ from ids computed from the malformed form.
 *
 * @param path Path of the file relative to the docs root; leading slashes
 *             are ignored.
 * @returns Absolute URL of the file on the `main` branch.
 */
const rawGithubUrl = (path: string) => {
  const root = `https://raw.githubusercontent.com/shadcn-ui/ui/main/apps/www/content/docs`;
  return `${root}/${path.replace(/^\/+/, '')}`;
};

/**
 * Uploads every `.mdx` document found under `base`:
 *   1. reads each file and derives its id by hashing its raw GitHub URL,
 *   2. stores the content in the 'test-vectors' bucket under that id,
 *   3. requests embeddings for all documents in one call,
 *   4. upserts one vector per document, carrying the URL as metadata.
 *
 * @throws Error if the embedding request fails, if the number of returned
 *         vectors does not match the number of documents, or if the upsert
 *         response reports failure.
 */
async function uploadShadcnUIDocuments(): Promise<undefined> {
  const docs: Array<{
    id: string,
    url: string,
    content: string
  }> = [];
  for await (const file of expandGlob('**/*.mdx', { includeDirs: false, root: base })) {
    const relpath = file.path.replace(base, '');
    const url = rawGithubUrl(relpath);
    const content = await Deno.readTextFile(file.path);
    const id = hash(url).toString();
    docs.push({ id, url, content });
  }

  // store documents
  for (const doc of docs) {
    console.log("Storing document", doc.id, doc.url);
    await putObject('test-vectors', doc.id, doc.content);
  }

  const res = await getEmbeddingVectors({ text: docs.map(doc => doc.content) });
  if (!res.success) {
    console.error(res);
    throw new Error('Failed to get embedding vectors');
  }

  // Vectors are paired with documents by position, so the counts must agree.
  const embeddings = res.result.data;
  if (embeddings.length !== docs.length) {
    throw new Error(`Expected ${docs.length} embedding vectors but received ${embeddings.length}`);
  }

  const newVectors: VectorizeVector[] = docs.map((doc, i) => ({
    id: doc.id,
    values: embeddings[i],
    metadata: {
      namespace: 'shadcn-ui-docs',
      url: doc.url
    }
  }));

  // The upsert call resolves with a response envelope instead of throwing,
  // so a failure has to be detected from its `success` flag.
  const upserted = await upsertVectors(newVectors);
  if (!upserted.success) {
    console.error(upserted);
    throw new Error('Failed to upsert vectors');
  }
}

await uploadShadcnUIDocuments();
mizchimizchi

実際にベクトル検索してみる。

import type { } from "npm:@cloudflare/workers-types@4.20240524.0";
import { getObject, queryByText } from "./lib.ts";
// Join all CLI arguments into one query string, look up the single closest
// document in the index, and print its stored content.
const queryText = Deno.args.join(" ");
console.log("Querying for", queryText);
const matches = await queryByText(queryText, { topK: 1 });
// The query can come back empty; report that instead of crashing on matches[0].id.
const best = matches[0];
if (!best) {
  console.error("No matching document found for", queryText);
  Deno.exit(1);
}
const result = await getObject('test-vectors', best.id);
console.log(result);

ボタンについて聞いてみる。

$ deno run -A --env build-doc.ts Button
Querying for Button
---
title: Button
description: Displays a button or a component that looks like a button.
featured: true
component: true
---

...

ボタン要素の使い方が説明された。

mizchimizchi

embedding vector を生成する方法として、cloudflare ではなく openai api のモデルを使ってみる。

ここで vectorize のインデックスを作り直した。openai の text-embedding-3-small のベクトル長は 1536 で、text-embedding-3-large はその2倍の 3072 になる。

$ npx wrangler vectorize create embeddings2 --dimensions=1536 --metric=cosine

ちなみに 3072 次元の vectorize インデックスは作成できなかった。

import OpenAI from "npm:openai@4.47.2";
import { upsertVectors } from "./lib.ts";

// Embed one sentence with OpenAI and upsert it into the index under id "1".
const client = new OpenAI({ apiKey: Deno.env.get("OPENAI_API_KEY")! });

const embeddings = await client.embeddings.create({
  model: "text-embedding-3-small",
  input: 'This is a story about an orange cloud',
  encoding_format: 'float',
});

// Only one input was sent, so the first entry holds its embedding.
const embedded = embeddings.data[0];
console.log(embedded.embedding);

const record: VectorizeVector = { id: "1", values: embedded.embedding };

const upserted = await upsertVectors([record]);
if (upserted.success) {
  console.log(upserted);
} else {
  console.error(upserted.errors[0].message);
  throw new Error('Failed to upsert vectors');
}

これで保存できる。

mizchimizchi

一応ローカルでコサイン類似度を計算できるようにしておく。ローカルと vectorize の実行結果が一致するかも確認する。

import OpenAI from "npm:openai@4.47.2";
import { queryVectors } from "./lib.ts";

// Compare a locally computed cosine similarity with the score the index
// reports for the same query embedding.
const client = new OpenAI({ apiKey: Deno.env.get("OPENAI_API_KEY")! });
const MODEL = "text-embedding-3-small";

const storedResponse = await client.embeddings.create({
  model: MODEL,
  input: 'This is a story about an orange cloud',
  encoding_format: 'float',
});
const storedVector = storedResponse.data[0].embedding;

console.log(storedVector);

const queryResponse = await client.embeddings.create({
  model: MODEL,
  input: 'orange cloud',
  encoding_format: 'float',
});
const queryVector = queryResponse.data[0].embedding;

// Local computation first, then the same query against the index.
console.log(cosineSimilarity(storedVector, queryVector));
const remote = await queryVectors(queryVector, { topK: 1 });
console.log('calc on cloud', remote);

/**
 * Computes the cosine similarity of two vectors:
 * dot(a, b) / (|a| * |b|).
 *
 * @param vecA First vector.
 * @param vecB Second vector; must have the same length as `vecA`.
 * @returns The cosine similarity of the two vectors.
 * @throws Error if the lengths differ, or if either vector has a zero norm
 *         (which includes empty vectors).
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  if (vecA.length !== vecB.length) {
    throw new Error('Vectors must be of the same length');
  }

  // Accumulate the dot product and both squared norms in a single pass.
  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  for (const [index, a] of vecA.entries()) {
    const b = vecB[index];
    dot += a * b;
    squaredA += a * a;
    squaredB += b * b;
  }

  const hasZeroNorm = squaredA === 0 || squaredB === 0;
  if (hasZeroNorm) {
    throw new Error('Norm of a vector is zero, cannot calculate cosine similarity');
  }

  const magnitude = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  return dot / magnitude;
}

Result

0.5484223977832671
calc on cloud {
  result: { count: 1, matches: [ { id: "1", score: 0.548422398 } ] },
  result_info: null,
  success: true,
  errors: [],
  messages: []
}