update
This commit is contained in:
54
packages/runtime/src/services/database/database.ts
Normal file
54
packages/runtime/src/services/database/database.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import knex, { type Knex } from 'knex';
|
||||
import ClientPgLite from 'knex-pglite';
|
||||
import { PGlite } from '@electric-sql/pglite';
|
||||
import { vector } from '@electric-sql/pglite/vector';
|
||||
|
||||
import { migrationSource } from './migrations/migrations.js';
|
||||
|
||||
import { destroy, Services } from '#root/utils/utils.services.js';
|
||||
|
||||
/**
 * Owns the shared Knex instance, which is backed by a PGlite database with the
 * `vector` extension registered.
 *
 * The instance is created lazily on the first `getInstance()` call; concurrent
 * callers share the same pending promise.
 */
class DatabaseService {
  #services: Services;
  #instance?: Promise<Knex>;

  constructor(services: Services) {
    this.#services = services;
  }

  /**
   * Creates the PGlite-backed Knex instance, enables the `vector` extension and
   * runs all pending migrations.
   *
   * @throws Re-throws any extension/migration failure after releasing the
   *   half-initialised Knex instance.
   */
  #setup = async (): Promise<Knex> => {
    const pglite = new PGlite({
      extensions: { vector },
    });
    const instance = knex({
      client: ClientPgLite,
      dialect: 'postgres',
      connection: () => ({ pglite }) as object,
    });

    try {
      await instance.raw(`CREATE EXTENSION IF NOT EXISTS vector`);
      await instance.migrate.latest({
        migrationSource: migrationSource({ services: this.#services }),
      });
    } catch (error) {
      // Do not leak the Knex handle when setup fails; the original failure is
      // the one worth reporting, so a secondary teardown error is ignored.
      try {
        await instance.destroy();
      } catch {
        // intentionally ignored
      }
      throw error;
    }

    return instance;
  };

  /**
   * Returns the shared Knex instance, creating it on first use.
   *
   * A failed setup is evicted from the cache so that a later call can retry
   * instead of receiving the same rejected promise forever.
   */
  public getInstance = (): Promise<Knex> => {
    const existing = this.#instance;
    if (existing) {
      return existing;
    }

    const pending = this.#setup();
    this.#instance = pending;
    void pending.catch(() => {
      // Only evict our own promise; a newer attempt may already be cached.
      if (this.#instance === pending) {
        this.#instance = undefined;
      }
    });
    return pending;
  };

  /**
   * Tears down the Knex instance if one was ever requested.
   *
   * The cached promise is cleared first so a later `getInstance()` builds a
   * fresh instance rather than handing out a destroyed one.
   */
  [destroy] = async () => {
    const pending = this.#instance;
    if (!pending) {
      return;
    }
    this.#instance = undefined;
    const instance = await pending;
    await instance.destroy();
  };
}
|
||||
|
||||
export { type TableRows, tableNames } from './migrations/migrations.js';
|
||||
export { DatabaseService };
|
||||
@@ -0,0 +1,112 @@
|
||||
import type { Migration } from './migrations.types.js';
|
||||
|
||||
import { EmbeddingsService } from '#root/services/embeddings/embeddings.js';
|
||||
import { EMBEDDING_MODEL } from '#root/utils/utils.consts.js';
|
||||
|
||||
/**
 * Physical table names used by the migrations and by every query builder.
 *
 * Declared `as const` so each entry keeps its literal type and the mapping
 * cannot be reassigned by accident.
 */
const tableNames = {
  documents: 'documents',
  documentChunks: 'documentChunks',
  relations: 'relations',
} as const;
|
||||
|
||||
/**
 * Initial schema: documents, their embedded chunks, and typed relations
 * between documents.
 */
const init: Migration = {
  name: 'init',
  up: async ({ knex, services }) => {
    // The embedding column type (`vector(<dimensions>)`) depends on the
    // configured embedding model, so it is resolved at migration time.
    const embedding = services.get(EmbeddingsService);
    const embeddingField = await embedding.getFieldType(EMBEDDING_MODEL);

    await knex.schema.createTable(tableNames.documents, (table) => {
      table.uuid('id').primary();
      // Self-reference: deleting a parent document cascades to its children.
      table.uuid('owner').nullable().references('id').inTable(tableNames.documents).onDelete('CASCADE');
      table.datetime('updatedAt').notNullable();
      table.datetime('createdAt').notNullable();
      table.datetime('deletedAt').nullable();
      table.string('contentType').nullable();
      table.text('content').nullable();
      table.string('source').nullable();
      table.string('sourceId').nullable();
      table.string('type').notNullable();
      table.integer('typeVersion').nullable();
      table.text('searchText').nullable();
      table.jsonb('metadata').nullable();

      table.index(['source', 'sourceId']);
      table.index(['owner']);
    });

    await knex.schema.createTable(tableNames.documentChunks, (table) => {
      table.uuid('id').primary();
      table.uuid('owner').nullable().references('id').inTable(tableNames.documents).onDelete('CASCADE');
      table.text('content').notNullable();
      table.specificType('embedding', embeddingField).notNullable();
      table.string('embeddingModel').notNullable();

      table.index(['owner']);
    });

    // The raw builder only executes when awaited. `??` bindings quote the
    // camel-case identifiers, which Postgres would otherwise fold to lowercase.
    // pgvector indexes use HNSW or IVFFlat (not GIN); `vector_cosine_ops`
    // matches the `<=>` cosine-distance operator.
    // NOTE(review): pgvector limits indexed `vector` columns to 2000
    // dimensions — confirm EMBEDDING_MODEL stays below that.
    await knex.raw('create index on ?? using hnsw (?? vector_cosine_ops)', [
      tableNames.documentChunks,
      'embedding',
    ]);

    await knex.schema.createTable(tableNames.relations, (table) => {
      table.uuid('from').notNullable().references('id').inTable(tableNames.documents).onDelete('CASCADE');
      table.uuid('to').notNullable().references('id').inTable(tableNames.documents).onDelete('CASCADE');
      // NOTE(review): `type` is declared nullable but is part of the primary
      // key below — confirm this is intended.
      table.string('type').nullable();
      table.string('typeVersion').nullable();
      table.datetime('updatedAt').notNullable();
      table.datetime('createdAt').notNullable();
      table.datetime('deletedAt').nullable();
      table.jsonb('data');

      table.primary(['from', 'to', 'type']);
      table.index(['from']);
      table.index(['to']);
    });
  },
  down: async ({ knex }) => {
    // Drop in reverse dependency order: referencing tables first.
    await knex.schema.dropTableIfExists(tableNames.relations);
    await knex.schema.dropTableIfExists(tableNames.documentChunks);
    await knex.schema.dropTableIfExists(tableNames.documents);
  },
};
|
||||
|
||||
/** Row shape of the `documents` table as created by the `init` migration. */
type DocumentRow = {
  /** UUID primary key. */
  id: string;
  /** Optional parent document id (self-referencing foreign key). */
  owner: string | null;
  updatedAt: Date;
  createdAt: Date;
  deletedAt: Date | null;
  contentType: string | null;
  content: string | null;
  /** Indexed together with `sourceId`. */
  source: string | null;
  sourceId: string | null;
  type: string;
  typeVersion: number | null;
  searchText: string | null;
  /** Stored in a `jsonb` column. */
  metadata: unknown;
};
|
||||
|
||||
/** Row shape of the `documentChunks` table as created by the `init` migration. */
type DocumentChunkRow = {
  /** UUID primary key. */
  id: string;
  /**
   * Id of the document this chunk belongs to.
   * NOTE(review): the column is created nullable in the migration — confirm
   * that `string` (rather than `string | null`) is intended here.
   */
  owner: string;
  content: string;
  /** Value of the pgvector column; its type is chosen per embedding model. */
  embedding: unknown;
  embeddingModel: string;
};
|
||||
|
||||
/**
 * Row shape of the `relations` table as created by the `init` migration.
 * The primary key is the `(from, to, type)` triple.
 */
type RelationRow = {
  /** Id of the source document. */
  from: string;
  /** Id of the target document. */
  to: string;
  type: string;
  typeVersion: string | null;
  updatedAt: Date;
  createdAt: Date;
  deletedAt: Date | null;
  /** Stored in a `jsonb` column. */
  data: unknown;
};
|
||||
|
||||
/**
 * Maps each table to its row type; the keys mirror `tableNames`, so
 * `TableRows['documents']` is the row type for `tableNames.documents`.
 */
type TableRows = {
  documents: DocumentRow;
  documentChunks: DocumentChunkRow;
  relations: RelationRow;
  /** @deprecated Misspelled alias kept for backward compatibility; use `relations`. */
  replations: RelationRow;
};
|
||||
|
||||
export type { TableRows };
|
||||
export { tableNames, init };
|
||||
@@ -0,0 +1,25 @@
|
||||
import type { Knex } from 'knex';
|
||||
|
||||
import type { Migration } from './migrations.types.js';
|
||||
import { init } from './migrations.001-init.js';
|
||||
|
||||
import type { Services } from '#root/utils/utils.services.js';
|
||||
|
||||
/** All known migrations, in the order they are handed to Knex. */
const migrations = [init] satisfies Migration[];

/** Dependencies handed through to every migration's `up`/`down`. */
type MigrationSourceOptions = {
  services: Services;
};
|
||||
|
||||
/**
 * Builds a Knex migration source over the in-code `migrations` list.
 *
 * Each migration's `up`/`down` receives the Knex instance supplied by the
 * migrator merged with the given `options`.
 */
const migrationSource = (options: MigrationSourceOptions): Knex.MigrationSource<Migration> => {
  const source: Knex.MigrationSource<Migration> = {
    getMigrationName(migration) {
      return migration.name;
    },
    async getMigration(migration) {
      return {
        name: migration.name,
        up: (knex) => migration.up({ ...options, knex }),
        down: (knex) => migration.down({ ...options, knex }),
      };
    },
    async getMigrations() {
      return migrations;
    },
  };
  return source;
};
|
||||
|
||||
export { type TableRows, tableNames } from './migrations.001-init.js';
|
||||
export { migrationSource };
|
||||
@@ -0,0 +1,16 @@
|
||||
import type { Knex } from 'knex';
|
||||
|
||||
import type { Services } from '#root/utils/utils.services.js';
|
||||
|
||||
/** Arguments passed to a migration's `up` and `down` functions. */
type MigrationOptions = {
  /** Knex instance the migration should run its statements on. */
  knex: Knex;
  /** Service container, for migrations that need other services. */
  services: Services;
};

/** A named, reversible schema change. */
type Migration = {
  /** Name of the migration. */
  name: string;
  /** Applies the change. */
  up: (options: MigrationOptions) => Promise<void>;
  /** Reverts the change. */
  down: (options: MigrationOptions) => Promise<void>;
};
|
||||
|
||||
export type { Migration };
|
||||
@@ -0,0 +1,13 @@
|
||||
import type { TableRows } from '../database/database.js';
|
||||
|
||||
import type { DocumentChunk } from './document-chunks.schemas.js';
|
||||
|
||||
/**
 * Converts a chunk row (joined with its document's `metadata`) into the
 * `DocumentChunk` shape by shallow-copying every property of the row.
 */
const mapFromDocumentChunkRow = (
  row: TableRows['documentChunks'] & {
    metadata: unknown;
  },
): DocumentChunk => {
  const chunk: DocumentChunk = { ...row };
  return chunk;
};
|
||||
|
||||
export { mapFromDocumentChunkRow };
|
||||
@@ -0,0 +1,33 @@
|
||||
import { z } from 'zod';
|
||||
import { queryFilterSchema } from '@morten-olsen/stash-query-dsl';
|
||||
|
||||
import { createListResultSchema } from '#root/utils/utils.schema.js';
|
||||
|
||||
/** Public shape of a document chunk. */
const documentChunkSchema = z.object({
  id: z.string(),
  /** Id of the owning document. */
  owner: z.string(),
  content: z.string(),
  metadata: z.unknown(),
});

/** Static type inferred from {@link documentChunkSchema}. */
type DocumentChunk = z.infer<typeof documentChunkSchema>;
|
||||
|
||||
/** Query options accepted when searching document chunks. */
const documentChunkFilterSchema = z.object({
  /** Page size; defaults to 20. */
  limit: z.number().default(20),
  /** Number of rows to skip; defaults to 0. */
  offset: z.number().default(0),
  /** Optional free text for a semantic search. */
  semanticText: z.string().optional(),
  /** Optional filter, either a structured query filter or a query string. */
  conditions: z.union([queryFilterSchema, z.string()]).optional(),
});

/** Static type inferred from {@link documentChunkFilterSchema}. */
type DocumentChunkFilter = z.infer<typeof documentChunkFilterSchema>;
|
||||
|
||||
/** A chunk as returned from a search, with an optional `distance`. */
const documentChunkWithDistanceSchema = documentChunkSchema.extend({
  distance: z.number().optional(),
});

/** List result wrapper around chunks returned from a search. */
const documentChunksFindResultSchema = createListResultSchema(documentChunkWithDistanceSchema);

/** Static type inferred from {@link documentChunksFindResultSchema}. */
type DocumentChunksFindResult = z.infer<typeof documentChunksFindResultSchema>;
|
||||
|
||||
export type { DocumentChunk, DocumentChunkFilter, DocumentChunksFindResult };
|
||||
export { documentChunkSchema, documentChunkFilterSchema, documentChunksFindResultSchema };
|
||||
@@ -0,0 +1,66 @@
|
||||
import { QueryParser } from '@morten-olsen/stash-query-dsl';
|
||||
|
||||
import { DatabaseService, tableNames, type TableRows } from '../database/database.js';
|
||||
import { EmbeddingsService } from '../embeddings/embeddings.js';
|
||||
|
||||
import type { DocumentChunkFilter, DocumentChunksFindResult } from './document-chunks.schemas.js';
|
||||
import { mapFromDocumentChunkRow } from './document-chunks.mappings.js';
|
||||
|
||||
import type { Services } from '#root/utils/utils.services.js';
|
||||
import { EMBEDDING_MODEL } from '#root/utils/utils.consts.js';
|
||||
import type { ExplicitAny } from '#root/global.js';
|
||||
import { applyQueryFilter } from '#root/utils/utils.query.js';
|
||||
|
||||
/**
 * Columns selected for every chunk query: all chunk columns plus the owning
 * document's `metadata` and `createdAt`.
 */
const baseFields = [
  `${tableNames.documentChunks}.*`,
  ...['metadata', 'createdAt'].map((column) => `${tableNames.documents}.${column}`),
];
|
||||
|
||||
/** Read-side service for searching document chunks. */
class DocumentChunksService {
  #services: Services;

  constructor(services: Services) {
    this.#services = services;
  }

  /**
   * Finds chunks joined with their owning document.
   *
   * With `filter.semanticText` the text is embedded with `EMBEDDING_MODEL` and
   * results are ordered by ascending `distance`, restricted to chunks embedded
   * with that model. Otherwise results are ordered by `createdAt` descending.
   * `filter.conditions`, `filter.limit` and `filter.offset` are applied in
   * both cases.
   *
   * @param filter Search and paging options.
   * @returns The matching chunks, mapped with `mapFromDocumentChunkRow`.
   */
  public find = async (filter: DocumentChunkFilter): Promise<DocumentChunksFindResult> => {
    const databaseService = this.#services.get(DatabaseService);
    const db = await databaseService.getInstance();
    const { semanticText, conditions, limit, offset } = filter;

    let query = db<TableRows['documentChunks']>(tableNames.documentChunks);
    // Knex builders are mutable, so the join applies without reassignment.
    query.join(tableNames.documents, `${tableNames.documents}.id`, `${tableNames.documentChunks}.owner`);

    if (semanticText) {
      const embeddings = this.#services.get(EmbeddingsService);
      const [vector] = await embeddings.extract({
        input: [semanticText],
        model: EMBEDDING_MODEL,
      });
      query = query
        .select(...baseFields, db.raw(`embedding <=> '${vector.toSql()}' as distance`))
        .where(`${tableNames.documentChunks}.embeddingModel`, EMBEDDING_MODEL)
        .orderBy('distance', 'asc');
    } else {
      query = query.select(baseFields).orderBy('createdAt', 'desc');
    }

    if (conditions) {
      const parser = this.#services.get(QueryParser);
      const parsed = typeof conditions === 'string' ? parser.parse(conditions) : conditions;
      query = applyQueryFilter(query, parsed);
    }

    const rows = await query.limit(limit).offset(offset);

    return {
      items: rows.map(mapFromDocumentChunkRow as ExplicitAny),
    };
  };
}
|
||||
|
||||
export * from './document-chunks.schemas.js';
|
||||
export { DocumentChunksService };
|
||||
12
packages/runtime/src/services/documents/documents.mapping.ts
Normal file
12
packages/runtime/src/services/documents/documents.mapping.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
import type { TableRows } from '../database/database.js';
|
||||
|
||||
import type { Document } from './documents.schemas.js';
|
||||
|
||||
/**
 * Converts a `documents` row into the `Document` shape by turning the `Date`
 * columns into ISO-8601 strings; a missing `deletedAt` becomes `null`.
 */
const mapFromDocumentRow = (row: TableRows['documents']): Document => {
  const { createdAt, updatedAt, deletedAt } = row;
  return {
    ...row,
    createdAt: createdAt.toISOString(),
    updatedAt: updatedAt.toISOString(),
    deletedAt: deletedAt ? deletedAt.toISOString() : null,
  };
};
|
||||
|
||||
export { mapFromDocumentRow };
|
||||
80
packages/runtime/src/services/documents/documents.schemas.ts
Normal file
80
packages/runtime/src/services/documents/documents.schemas.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
import { z } from 'zod';
|
||||
import { queryFilterSchema } from '@morten-olsen/stash-query-dsl';
|
||||
|
||||
import { createListResultSchema } from '#root/utils/utils.schema.js';
|
||||
|
||||
/** Public shape of a stored document; timestamps are ISO datetime strings. */
const documentSchema = z.object({
  id: z.string(),
  /** Id of the parent document, if any. */
  owner: z.string().nullable(),
  createdAt: z.iso.datetime(),
  updatedAt: z.iso.datetime(),
  deletedAt: z.iso.datetime().nullable(),
  contentType: z.string().nullable(),
  content: z.string().nullable(),
  source: z.string().nullable(),
  sourceId: z.string().nullable(),
  type: z.string(),
  typeVersion: z.int().nullable(),
  searchText: z.string().nullable(),
  metadata: z.unknown(),
});

/** Static type inferred from {@link documentSchema}. */
type Document = z.infer<typeof documentSchema>;
|
||||
|
||||
/**
 * Payload accepted when inserting or updating a document. Every field may be
 * omitted.
 */
const documentUpsertSchema = z
  .object({
    id: z.string().nullish(),
    owner: z.string().nullish(),
    contentType: z.string().nullish(),
    content: z.string().nullish(),
    source: z.string().nullish(),
    sourceId: z.string().nullish(),
    type: z.string().optional(),
    typeVersion: z.int().nullish(),
    searchText: z.string().nullish(),
    metadata: z.unknown().nullish(),
  })
  .meta({
    example: {
      content: 'the cat is yellow',
      contentType: 'text/plain',
      source: 'test',
      sourceId: 'test',
      type: 'raw',
      metadata: {
        foo: 'bar',
        bar: 'baz',
      },
    },
  });

/** Static type inferred from {@link documentUpsertSchema}. */
type DocumentUpsert = z.infer<typeof documentUpsertSchema>;
|
||||
|
||||
/** Outcome of an upsert: what happened, and the affected document. */
const documentUpsertResultSchema = z.object({
  /** One of `inserted`, `updated` or `skipped`. */
  action: z.enum(['inserted', 'updated', 'skipped']),
  id: z.string(),
  document: documentSchema,
});

/** Static type inferred from {@link documentUpsertResultSchema}. */
type DocumentUpsertResult = z.infer<typeof documentUpsertResultSchema>;
|
||||
|
||||
/** Query options accepted when listing documents. */
const documentFilterSchema = z.object({
  /** Number of rows to skip; defaults to 0. */
  offset: z.number().default(0),
  /** Page size; defaults to 20. */
  limit: z.number().default(20),
  /** Filter, either a structured query filter or a query string. */
  condition: z.union([queryFilterSchema, z.string()]),
});

/** Static type inferred from {@link documentFilterSchema}. */
type DocumentFilter = z.infer<typeof documentFilterSchema>;
|
||||
|
||||
/** List result wrapper around documents returned from a search. */
const documentFindResultSchema = createListResultSchema(documentSchema);

/** Static type inferred from {@link documentFindResultSchema}. */
type DocumentFindResult = z.infer<typeof documentFindResultSchema>;
|
||||
|
||||
export type { Document, DocumentUpsert, DocumentUpsertResult, DocumentFilter, DocumentFindResult };
|
||||
export {
|
||||
documentSchema,
|
||||
documentUpsertSchema,
|
||||
documentUpsertResultSchema,
|
||||
documentFilterSchema,
|
||||
documentFindResultSchema,
|
||||
};
|
||||
179
packages/runtime/src/services/documents/documents.ts
Normal file
179
packages/runtime/src/services/documents/documents.ts
Normal file
@@ -0,0 +1,179 @@
|
||||
import { QueryParser } from '@morten-olsen/stash-query-dsl';
|
||||
|
||||
import { DatabaseService, tableNames, type TableRows } from '../database/database.js';
|
||||
import { SplittingService } from '../splitter/splitter.js';
|
||||
|
||||
import type {
|
||||
Document,
|
||||
DocumentFilter,
|
||||
DocumentFindResult,
|
||||
DocumentUpsert,
|
||||
DocumentUpsertResult,
|
||||
} from './documents.schemas.ts';
|
||||
import { mapFromDocumentRow } from './documents.mapping.js';
|
||||
|
||||
import { EventEmitter } from '#root/utils/utils.event-emitter.js';
|
||||
import type { Services } from '#root/utils/utils.services.js';
|
||||
import { compareObjectKeys } from '#root/utils/utils.compare.js';
|
||||
import { applyQueryFilter } from '#root/utils/utils.query.js';
|
||||
|
||||
/** Events emitted by `DocumentsService`, keyed by event name. */
type DocumentsServiceEvents = {
  /** Emitted alongside both `inserted` and `updated`. */
  upserted: (document: Document) => void;
  /** Emitted when an upsert inserts a new document. */
  inserted: (document: Document) => void;
  /** Emitted when an upsert updates an existing document. */
  updated: (document: Document) => void;
};
|
||||
|
||||
/**
 * CRUD service for documents. Upserts re-index the document's chunks and emit
 * `inserted` / `updated` / `upserted` events.
 */
class DocumentsService extends EventEmitter<DocumentsServiceEvents> {
  #services: Services;

  constructor(services: Services) {
    super();
    this.#services = services;
  }

  /**
   * Lists documents matching `filter.condition`, paged by `filter.limit` and
   * `filter.offset`.
   *
   * @param filter Condition (structured filter or query string) plus paging.
   */
  public find = async (filter: DocumentFilter): Promise<DocumentFindResult> => {
    const databaseService = this.#services.get(DatabaseService);
    const db = await databaseService.getInstance();
    let query = db<TableRows['documents']>(tableNames.documents);

    const parser = this.#services.get(QueryParser);
    query = applyQueryFilter(
      query,
      typeof filter.condition === 'string' ? parser.parse(filter.condition) : filter.condition,
    );
    query = query.limit(filter.limit).offset(filter.offset);

    const items = await query;
    return {
      items: items.map(mapFromDocumentRow),
    };
  };

  /**
   * Loads a single document by id.
   *
   * @throws Error when no document with that id exists.
   */
  public get = async (id: string): Promise<Document> => {
    const databaseService = this.#services.get(DatabaseService);
    const db = await databaseService.getInstance();
    const [item] = await db<TableRows['documents']>(tableNames.documents).where('id', id).limit(1);
    if (!item) {
      // Fail with a clear message instead of a TypeError inside the mapper.
      throw new Error(`Document not found: ${id}`);
    }
    return mapFromDocumentRow(item);
  };

  /** Deletes the document with the given id (a no-op if it does not exist). */
  public remove = async (id: string): Promise<void> => {
    const databaseService = this.#services.get(DatabaseService);
    const db = await databaseService.getInstance();
    await db<TableRows['documents']>(tableNames.documents).where('id', id).delete();
  };

  /**
   * Inserts or updates a document.
   *
   * The target row is resolved by `(source, sourceId)` when both are given,
   * otherwise by `id` (a random UUID is generated when absent). If the compared
   * fields are unchanged the call is reported as `skipped` and nothing is
   * written. After an insert or update the document's chunks are replaced with
   * freshly split and embedded ones.
   *
   * Events are emitted only after the document transaction has committed.
   *
   * @param document Fields to write.
   * @returns The action taken, the document id and the resulting document.
   */
  public upsert = async (document: DocumentUpsert): Promise<DocumentUpsertResult> => {
    const databaseService = this.#services.get(DatabaseService);
    const db = await databaseService.getInstance();

    const result = await db.transaction(async (trx) => {
      let id = document.id || crypto.randomUUID();
      if (document.source && document.sourceId) {
        // An existing (source, sourceId) pair wins over the supplied id.
        const [currentSourceDocument] = await trx<TableRows['documents']>(tableNames.documents)
          .where('source', document.source)
          .andWhere('sourceId', document.sourceId)
          .limit(1);
        if (currentSourceDocument) {
          id = currentSourceDocument.id;
        }
      }

      const now = new Date();
      const [current] = await trx<TableRows['documents']>(tableNames.documents).where('id', id).limit(1);

      if (current) {
        if (
          compareObjectKeys(current, document, [
            'sourceId',
            'source',
            'content',
            'contentType',
            'searchText',
            'type',
            'typeVersion',
            'metadata',
          ])
        ) {
          return {
            id,
            action: 'skipped',
            document: mapFromDocumentRow(current),
          } as const;
        }

        await trx<TableRows['documents']>(tableNames.documents)
          .update({
            ...document,
            id,
            updatedAt: now,
          })
          .where('id', id);

        const resultDocument: Document = mapFromDocumentRow({
          ...current,
          ...document,
          id,
          // Mirror what was just written so the result is not stale.
          updatedAt: now,
        });
        return {
          id,
          action: 'updated',
          document: resultDocument,
        } as const;
      }

      await trx<TableRows['documents']>(tableNames.documents).insert({
        metadata: {},
        type: 'raw',
        ...document,
        id,
        createdAt: now,
        updatedAt: now,
      });

      const resultDocument: Document = mapFromDocumentRow({
        type: 'raw',
        owner: null,
        contentType: null,
        content: null,
        source: null,
        sourceId: null,
        typeVersion: null,
        searchText: null,
        metadata: {},
        ...document,
        deletedAt: null,
        id,
        createdAt: now,
        updatedAt: now,
      });
      return {
        id,
        action: 'inserted',
        document: resultDocument,
      } as const;
    });

    if (result.action === 'skipped') {
      return result;
    }

    // Emit after commit so listeners never observe a write that rolls back.
    if (result.action === 'updated') {
      this.emit('updated', result.document);
    } else {
      this.emit('inserted', result.document);
    }
    this.emit('upserted', result.document);

    // Split and embed before opening the transaction, so it stays short.
    const splittingService = this.#services.get(SplittingService);
    const chunks = await splittingService.chunk(result.document);

    await db.transaction(async (trx) => {
      await trx<TableRows['documentChunks']>(tableNames.documentChunks).delete().where('owner', result.id);
      if (chunks.length > 0) {
        await trx<TableRows['documentChunks']>(tableNames.documentChunks).insert(
          chunks.map((chunk) => ({
            id: crypto.randomUUID(),
            owner: result.id,
            content: chunk.content,
            embedding: chunk.vector.toSql(),
            embeddingModel: chunk.model,
          })),
        );
      }
    });

    return result;
  };
}
|
||||
|
||||
export * from './documents.schemas.js';
|
||||
export { DocumentsService };
|
||||
62
packages/runtime/src/services/embeddings/embeddings.ts
Normal file
62
packages/runtime/src/services/embeddings/embeddings.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
import { pipeline, type FeatureExtractionPipeline } from '@huggingface/transformers';
|
||||
|
||||
import { Vector } from './embeddings.vector.js';
|
||||
|
||||
import type { ExplicitAny } from '#root/global.js';
|
||||
|
||||
/** Arguments for `EmbeddingsService.extract`. */
type ExtractOptions = {
  /** Texts to embed; one vector is produced per entry. */
  input: string[];
  /** Name of the model to load. */
  model: string;
};

/** A loaded feature-extraction pipeline plus its embedding size. */
type Extractor = {
  extractor: FeatureExtractionPipeline;
  /** Taken from the model config's `hidden_size`. */
  dimensions: number;
};
|
||||
|
||||
/**
 * Loads feature-extraction pipelines on demand and turns text into `Vector`s.
 * One pipeline is kept per model name; concurrent requests for the same model
 * share the same pending load.
 */
class EmbeddingsService {
  #extractors = new Map<string, Promise<Extractor>>();

  /**
   * Loads the pipeline for `model` and reads its embedding size from the
   * model config's `hidden_size`.
   *
   * @throws Error when the config has no numeric `hidden_size`.
   */
  #setupExtractor = async (model: string): Promise<Extractor> => {
    const extractor = await pipeline('feature-extraction', model, {});
    const { config } = extractor.model;
    if (!('hidden_size' in config) || typeof config.hidden_size !== 'number') {
      throw new Error('Invalid model configuration');
    }
    return {
      extractor,
      dimensions: config.hidden_size,
    };
  };

  /**
   * Returns the cached extractor for `name`, starting a load if needed.
   *
   * A failed load is evicted from the cache so a later call can retry instead
   * of receiving the same rejected promise forever.
   */
  #getExtractor = (name: string): Promise<Extractor> => {
    const cached = this.#extractors.get(name);
    if (cached) {
      return cached;
    }

    const pending = this.#setupExtractor(name);
    this.#extractors.set(name, pending);
    void pending.catch(() => {
      // Only evict our own entry; a newer attempt may already be cached.
      if (this.#extractors.get(name) === pending) {
        this.#extractors.delete(name);
      }
    });
    return pending;
  };

  /**
   * Embeds every string in `options.input` with `options.model`, using CLS
   * pooling.
   *
   * @returns One `Vector` per input; an empty input yields an empty array
   *   without loading the model.
   */
  public extract = async (options: ExtractOptions) => {
    const { input, model } = options;
    if (input.length === 0) {
      return [];
    }
    const { extractor, dimensions } = await this.#getExtractor(model);
    const output = await extractor(input, { pooling: 'cls' });
    return output.tolist().map((v: ExplicitAny) => new Vector(v, dimensions));
  };

  /** Returns the embedding size of `model`. */
  public getDimensions = async (model: string) => {
    const { dimensions } = await this.#getExtractor(model);
    return dimensions;
  };

  /** Returns the column type for `model`'s embeddings, e.g. `vector(384)`. */
  public getFieldType = async (model: string) => {
    const dimensions = await this.getDimensions(model);
    return `vector(${dimensions})`;
  };
}
|
||||
|
||||
export { EmbeddingsService, Vector };
|
||||
@@ -0,0 +1,37 @@
|
||||
import { cos_sim } from '@huggingface/transformers';
|
||||
import { toSql } from 'pgvector';
|
||||
|
||||
/** An embedding value together with the dimension count it was created with. */
class Vector {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  #value: any;
  #dimentions: number;

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  constructor(value: any, dimentions: number) {
    this.#value = value;
    this.#dimentions = dimentions;
  }

  /** The raw embedding value. */
  public get value() {
    return this.#value;
  }

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  public set value(value: any) {
    this.#value = value;
  }

  /** Dimension count (original, misspelled accessor kept for compatibility). */
  public get dimentions() {
    return this.#dimentions;
  }

  /** Dimension count; correctly spelled alias of `dimentions`. */
  public get dimensions() {
    return this.#dimentions;
  }

  /** Serialises the value with pgvector's `toSql`. */
  public toSql = () => {
    return toSql(this.#value);
  };

  /**
   * Cosine distance to `other`, i.e. `1 - cosine similarity`, so smaller means
   * closer.
   */
  public distanceTo = (other: Vector) => {
    return 1 - cos_sim(this.#value, other.value);
  };
}
|
||||
|
||||
export { Vector };
|
||||
44
packages/runtime/src/services/splitter/splitter.ts
Normal file
44
packages/runtime/src/services/splitter/splitter.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { EmbeddingsService } from '../embeddings/embeddings.js';
|
||||
import type { Document } from '../documents/documents.schemas.js';
|
||||
|
||||
import type { Chunk, Splitter } from './splitter.types.js';
|
||||
import { textSplitter } from './splitters/splitters.text.js';
|
||||
|
||||
import type { Services } from '#root/utils/utils.services.js';
|
||||
import { EMBEDDING_MODEL } from '#root/utils/utils.consts.js';
|
||||
|
||||
/**
 * Splits a document into text chunks with the first matching splitter and
 * embeds each chunk with `EMBEDDING_MODEL`.
 */
class SplittingService {
  #services: Services;
  #chunkers: Set<Splitter>;

  constructor(services: Services) {
    this.#services = services;
    this.#chunkers = new Set();
    this.addChunkers([textSplitter]);
  }

  /** Registers additional splitters; duplicates are ignored. */
  public addChunkers = (splitter: Splitter[]) => {
    // A plain loop avoids relying on `Set.prototype.union`, which only very
    // recent runtimes provide.
    for (const candidate of splitter) {
      this.#chunkers.add(candidate);
    }
  };

  /**
   * Chunks and embeds `input`.
   *
   * @returns One entry per chunk with its text, vector and model name; an
   *   empty array when no splitter matches or the splitter yields no text.
   */
  public chunk = async (input: Document): Promise<Chunk[]> => {
    let splitter: Splitter | undefined;
    for (const candidate of this.#chunkers) {
      if (candidate.match(input)) {
        splitter = candidate;
        break;
      }
    }
    if (!splitter) {
      return [];
    }

    const chunks = await splitter.chunk(input);
    if (chunks.length === 0) {
      // Nothing to embed; do not hand an empty batch to the embedding model.
      return [];
    }

    const embeddingsService = this.#services.get(EmbeddingsService);
    const vectors = await embeddingsService.extract({
      input: chunks,
      model: EMBEDDING_MODEL,
    });

    return chunks.map((content, index) => ({
      content,
      vector: vectors[index],
      model: EMBEDDING_MODEL,
    }));
  };
}
|
||||
|
||||
export * from './splitter.types.js';
|
||||
export { SplittingService };
|
||||
15
packages/runtime/src/services/splitter/splitter.types.ts
Normal file
15
packages/runtime/src/services/splitter/splitter.types.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import type { Document } from '../documents/documents.schemas.js';
|
||||
import type { Vector } from '../embeddings/embeddings.vector.js';
|
||||
|
||||
/** A piece of document text together with its embedding. */
type Chunk = {
  /** The chunk's text. */
  content: string;
  /** Embedding of `content`. */
  vector: Vector;
  /** Name of the model that produced `vector`. */
  model: string;
};

/** Strategy for turning a document into text chunks. */
type Splitter = {
  /** Whether this splitter can handle the given document. */
  match: (document: Document) => boolean;
  /** Splits the document into chunk texts. */
  chunk: (document: Document) => Promise<string[]>;
};
|
||||
|
||||
export type { Chunk, Splitter };
|
||||
@@ -0,0 +1,17 @@
|
||||
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
||||
|
||||
import type { Splitter } from '../splitter.types.js';
|
||||
|
||||
/**
 * Splitter for any document with non-empty `content`. It uses a recursive
 * character splitter with a chunk size of 100 and no overlap.
 */
const textSplitter: Splitter = {
  match: ({ content }) => Boolean(content),
  chunk: async ({ content }) => {
    if (!content) {
      return [];
    }
    const recursiveSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 100, chunkOverlap: 0 });
    return recursiveSplitter.splitText(content);
  },
};
|
||||
|
||||
export { textSplitter };
|
||||
17
packages/runtime/src/services/warmup/warmup.ts
Normal file
17
packages/runtime/src/services/warmup/warmup.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
import { DatabaseService } from '../database/database.js';
|
||||
|
||||
import { Services } from '#root/utils/utils.services.js';
|
||||
|
||||
/** Eagerly initialises services that are otherwise created lazily. */
class WarmupService {
  #services: Services;

  constructor(services: Services) {
    this.#services = services;
  }

  /** Resolves once the database instance has been obtained. */
  public ensure = async () => {
    await this.#services.get(DatabaseService).getInstance();
  };
}
|
||||
|
||||
export { WarmupService };
|
||||
Reference in New Issue
Block a user