pub const CREATE_COLLECTIONS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS pgml.collections (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
name text NOT NULL,
active BOOLEAN DEFAULT TRUE,
project_id int8 NOT NULL REFERENCES pgml.projects ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED,
sdk_version text,
UNIQUE (name)
);
"#;
pub const PIPELINES_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
name text NOT NULL,
created_at timestamp NOT NULL DEFAULT now(),
active BOOLEAN NOT NULL DEFAULT TRUE,
schema jsonb NOT NULL,
UNIQUE (name)
);
"#;
pub const CREATE_DOCUMENTS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
source_uuid uuid NOT NULL,
version jsonb NOT NULL DEFAULT '{}'::jsonb,
document jsonb NOT NULL,
UNIQUE (source_uuid)
);
"#;
pub const CREATE_SPLITTERS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS pgml.splitters (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
name text NOT NULL,
parameters jsonb NOT NULL DEFAULT '{}',
project_id int8 NOT NULL REFERENCES pgml.projects ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED
);
"#;
pub const CREATE_CHUNKS_TABLE: &str = r#"CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(),
document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED,
chunk_index int8 NOT NULL,
chunk text NOT NULL,
UNIQUE (document_id, chunk_index)
);
"#;
pub const CREATE_EMBEDDINGS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED,
embedding vector(%d) NOT NULL,
UNIQUE (chunk_id)
);
"#;
pub const CREATE_CHUNKS_TSVECTORS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED,
ts tsvector,
UNIQUE (chunk_id)
);
"#;
pub const CREATE_PIPELINES_SEARCHES_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
query jsonb
);
"#;
pub const CREATE_PIPELINES_SEARCH_RESULTS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
search_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE,
document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE,
scores jsonb NOT NULL,
rank integer NOT NULL
);
"#;
pub const CREATE_PIPELINES_SEARCH_EVENTS_TABLE: &str = r#"
CREATE TABLE IF NOT EXISTS %s (
id serial8 PRIMARY KEY,
created_at timestamp NOT NULL DEFAULT now(),
search_result int8 NOT NULL REFERENCES %s ON DELETE CASCADE,
event jsonb NOT NULL
);
"#;
pub const CREATE_INDEX: &str = r#"
CREATE INDEX %d IF NOT EXISTS %s ON %s (%d);
"#;
pub const CREATE_INDEX_USING_GIN: &str = r#"
CREATE INDEX %d IF NOT EXISTS %s ON %s USING GIN (%d);
"#;
pub const CREATE_INDEX_USING_HNSW: &str = r#"
CREATE INDEX %d IF NOT EXISTS %s on %s using hnsw (%d) %d;
"#;
pub const INSERT_SEARCH_EVENT: &str = r#"
INSERT INTO %s (search_result, event) VALUES ((SELECT id FROM %s WHERE search_id = $1 AND rank = $2), $3)
"#;
pub const UPSERT_DOCUMENT_AND_MERGE_METADATA: &str = r#"
WITH prev AS (
SELECT id, document FROM %s WHERE source_uuid = ANY({binding_parameter})
) INSERT INTO %s (source_uuid, document, version)
VALUES {values_parameters}
ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document, version = EXCLUDED.version
RETURNING id, (SELECT document FROM prev WHERE prev.id = %s.id)
"#;
pub const UPSERT_DOCUMENT: &str = r#"
WITH prev AS (
SELECT id, document FROM %s WHERE source_uuid = ANY({binding_parameter})
) INSERT INTO %s (source_uuid, document, version)
VALUES {values_parameters}
ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document, version = EXCLUDED.version
RETURNING id, (SELECT document FROM prev WHERE prev.id = %s.id)
"#;
pub const GENERATE_TSVECTORS_FOR_CHUNK_IDS: &str = r#"
INSERT INTO %s (chunk_id, ts)
SELECT
id,
to_tsvector('%d', chunk) ts
FROM
%s
WHERE id = ANY ($1)
ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts;
"#;
pub const GENERATE_TSVECTORS: &str = r#"
INSERT INTO %s (chunk_id, ts)
SELECT
id,
to_tsvector('%d', chunk) ts
FROM
%s chunks
ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts;
"#;
pub const GENERATE_EMBEDDINGS_FOR_CHUNK_IDS: &str = r#"
INSERT INTO %s (chunk_id, embedding)
SELECT
unnest(array_agg(id)),
pgml.embed(
inputs => array_agg(chunk),
transformer => $1,
kwargs => $2
)
FROM
%s
WHERE
id = ANY ($3)
ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding
"#;
pub const GENERATE_EMBEDDINGS: &str = r#"
INSERT INTO %s (chunk_id, embedding)
SELECT
id,
pgml.embed(
text => chunk,
transformer => $1,
kwargs => $2
)
FROM
%s
ON CONFLICT (chunk_id) DO UPDATE set embedding = EXCLUDED.embedding;
"#;
pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS_WITH_SPLITTER: &str = r#"
WITH splitter AS (
SELECT
name,
parameters
FROM
pgml.splitters
WHERE
id = $1
),
new AS (
SELECT
documents.id AS document_id,
pgml.chunk ((
SELECT
name
FROM splitter), %d, (
SELECT
parameters
FROM splitter)) AS chunk_t
FROM
%s AS documents
WHERE
id = ANY ($2)
),
del AS (
DELETE FROM %s chunks
WHERE chunk_index > (
SELECT
MAX((chunk_t).chunk_index)
FROM
new
WHERE
new.document_id = chunks.document_id
GROUP BY
new.document_id)
AND chunks.document_id = ANY (
SELECT
document_id
FROM
new))
INSERT INTO %s (document_id, chunk_index, chunk)
SELECT
new.document_id,
(chunk_t).chunk_index,
(chunk_t).chunk
FROM
new
LEFT OUTER JOIN %s chunks ON chunks.document_id = new.document_id
AND chunks.chunk_index = (chunk_t).chunk_index
WHERE (chunk_t).chunk <> COALESCE(chunks.chunk, '')
ON CONFLICT (document_id, chunk_index)
DO UPDATE SET
chunk = EXCLUDED.chunk
RETURNING
id;
"#;
pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS: &str = r#"
INSERT INTO %s(
document_id, chunk_index, chunk
)
SELECT
documents.id,
1,
%d
FROM %s documents
LEFT OUTER JOIN %s chunks ON chunks.document_id = documents.id
WHERE documents.%d <> COALESCE(chunks.chunk, '')
AND documents.id = ANY($1)
ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk
RETURNING id
"#;
pub const GENERATE_CHUNKS_WITH_SPLITTER: &str = r#"
WITH splitter AS (
SELECT
name,
parameters
FROM
pgml.splitters
WHERE
id = $1
),
new AS (
SELECT
documents.id AS document_id,
pgml.chunk ((
SELECT
name
FROM splitter), %d, (
SELECT
parameters
FROM splitter)) AS chunk_t
FROM
%s AS documents
),
del AS (
DELETE FROM %s chunks
WHERE chunk_index > (
SELECT
MAX((chunk_t).chunk_index)
FROM
new
WHERE
new.document_id = chunks.document_id
GROUP BY
new.document_id)
AND chunks.document_id = ANY (
SELECT
document_id
FROM
new))
INSERT INTO %s (document_id, chunk_index, chunk)
SELECT
new.document_id,
(chunk_t).chunk_index,
(chunk_t).chunk
FROM
new
ON CONFLICT (document_id, chunk_index)
DO UPDATE SET
chunk = EXCLUDED.chunk;
"#;
pub const GENERATE_CHUNKS: &str = r#"
INSERT INTO %s (
document_id, chunk_index, chunk
)
SELECT
id,
1,
%d
FROM %s
ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk
RETURNING id
"#;