gline-rs 1.0.0 - Docs.rs

#set heading(numbering: "1.")
#show link: underline

#place(
  top + center, float: true, scope: "parent",
  text(1.8em, weight: "bold")[Matrix-Level Documentation of \ `gline-rs` Processing Steps]  
)

#place(
  top + center, float: true, scope: "parent",
  text[Frédérik Bilhaut]
)

#v(20pt)

This documents aims at providing a matrix-level description of the pipeline needed for GLiNER inferences, as implemented by #link("https://github.com/fbilhaut/gline-rs")[`gline-rs`].

Concrete examples are provided for each step, all of which build on the input given in the first one.

#v(20pt)
#outline(depth: 2)

#pagebreak()
= Pre-Processing (Common)

== Text Input

This is the "user" input for the whole processing.

=== Source Code

- Related struct: `gliner::model::input::text::TextInput`

=== Format

- $n$: number of input texts
- $k$: number of entity class labels
- $I$: sequence of input texts matrix of type `string` and size $n$
- $E$: entity class labels matrix, of type `string` and size $k$

#align(left, block($ 
I =
  mat(delim: "[",
  "text"_1;
  "text"_2;
  dots.v;
  "text"_n
)
$))

#align(left, block($
E = mat(delim: "[",
  "label"_1;
  "label"_2;
  dots.v;
  "label"_k
)
$))


=== Example

#align(left, block($
I =
  mat(delim: "[",
  &"\"My name is James Bond\"";
  &"\"I like to drive my Aston Martin\"";  
)
$))

#align(left, block($
E = mat(delim: "[",
  &"\"movie character\"";
  &"\"vehicle\"";
)
$))

#pagebreak()
== Word-Level Tokenization

=== Transformation

#align(left, block($(I, E) -> (T, E)$))

=== Source Code

- Struct: `gliner::model::input::tokenized::TokenizedInput`
- Transformation: `gliner::model::input::prompt::RawToTokenized`

=== Format

- $n, k$:  same as before
- $T$: sequence of sequence of tokenized input texts, of type `string` and size $n$
- $E$: same as before

#align(left, block($
T = mat(delim: "[",
  mat(delim: "[", "token"_"1,1", "token"_"1,2", dots);
  mat(delim: "[", "token"_"2,1", "token"_"2,2", dots);
  dots.v;
  mat(delim: "[", "token"_"n,1", "token"_"n,2", dots);
)
$))

=== Example

#align(left, block($
T = mat(delim: "[",
  &mat(delim: "[", "\"My\"" "\"name\"", "\"is\"", "\"James\"", "\"Bond\"");  
  &mat(delim: "[", "\"I\"" "\"like\"", "\"to\"", "\"drive\"", "\"my\"", "\"Aston\"", "\"Martin\"");  
)
$))

#pagebreak()
== Prompt Preparation

Prepared prompts, appending entity and text tokens.

=== Transformation

#align(left, block($(T, E) -> P$))

=== Source Code

- Struct: `gliner::model::input::prompt::PromptInput`
- Transformation from `TokenizedInput`: `gliner::model::input::prompt::TokenizedToPrompt`

=== Format

#align(left, block($
P = mat(delim: "[",
  mat(delim: "[", "<<ENT>>", "label"_"1,1", "<<ENT>>", "label"_"1,2", dots, "<<SEP>>", "token"_"1,1", , "token"_"1,2", dots);
  mat(delim: "[", "<<ENT>>", "label"_"2,1", "<<ENT>>", "label"_"2,2", dots, "<<SEP>>", "token"_"2,1", , "token"_"2,2", dots);
  dots.v ;
  mat(delim: "[", "<<ENT>>", "label"_"n,1", "<<ENT>>", "label"_"n,2", dots, "<<SEP>>", "token"_"n,1", , "token"_"n,2", dots)
)
$))

=== Example

#align(left, block($
P = mat(delim: "[",
  &mat(delim: "[", "<<ENT>>", "\"movie character\"", "<<ENT>>", "\"vehicle\"", dots, "<<SEP>>", "\"My\"", "\"name\"", "\"is\"", "\"James\"", "\"Bond\"");
  &mat(delim: "[", "<<ENT>>", "\"movie character\"", "<<ENT>>", "\"vehicle\"", dots, "<<SEP>>", "\"I\"", "\"like\"", "\"to\"", "\"drive\"", "\"my\"", "\"Austin\"", "\"Martin\"");
)
$))


#pagebreak()
== Prompt Encoding (Sub-Word Tokenization)

=== Transformation

#align(left, block($P -> (I, A, W, L)$))

=== Source Code

- Struct: `gliner::model::input::encoded::EncodedPrompt`
- Transformation: `gliner::model::input::encoded::PromptsToEncoded`

=== Format

#let ststart(x) = text(fill: green, $#x$)
#let stend(x) = text(fill: red, $#x$)
#let stent(x) = text(fill: orange, $#x$)

- k: maximum number of sub-word tokens within a sequence, adding start ($ststart(1)$) and end ($stend(2)$) tokens
- I: encoded prompts of type `i64` and shape $(n*k)$
- A: attention masks of type `i64` and shape $(n*k)$
- W: word masks of type `i64` and shape $(n*k)$
- L: text lengths of type `i64` and shape $(n*1)$

#align(left, block($
I = mat(
  "token_id"_"1,1", "token_id"_"1,2", dots, "token_id"_"1,k" ;
  "token_id"_"2,1", "token_id"_"2,2", dots, "token_id"_"2,k" ;
  dots.v, dots.v, dots.down, dots.v ;
  "token_id"_"n,1", "token_id"_"n,2", dots, "token_id"_"n,k" ;  
)
$))

#align(left, block(
$
A = mat(
  "attn_mask"_"1,1", "attn_mask"_"1,2", dots, "attn_mask"_"1,k" ;
  "attn_mask"_"2,1", "attn_mask"_"2,2", dots, "attn_mask"_"2,k" ;
  dots.v, dots.v, dots.down, dots.v ;
  "attn_mask"_"n,1", "attn_mask"_"n,2", dots, "attn_mask"_"n,k" ;  
)
$))

#align(left, block($
W = mat(
  "word_mask"_"1,1", "word_mask"_"1,2", dots, "word_mask"_"1,k" ;
  "word_mask"_"2,1", "word_mask"_"2,2", dots, "word_mask"_"2,k" ;
  dots.v, dots.v, dots.down, dots.v ;
  "word_mask"_"n,1", "word_mask"_"n,2", dots, "word_mask"_"n,k" ;  
)
$))

#align(left, block($
L = mat(
  "l"_"1"; 
  dots.v;
  "l"_"n";
)
$))

=== Example

#align(left, block($
I = mat(
  ststart(1), stent(128002), 1421, 1470, stent(128002), 1508, stent(128003), 573, 601, 269, 1749, 8728, stend(2), 0, 0;
  ststart(1), stent(128002), 1421, 1470, stent(128002), 1508, stent(128003), 273, 334, 264, 1168, 312, 20844, 2963, stend(2);
)
$))

#align(left, block($
A = mat(
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0;
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1;
)
$))

#align(left, block($
W = mat(
  0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 0;
  0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0;
)
$))

#align(left, block($
L = mat(
  5;
  7
)
$))

#pagebreak()
= Pre-Processing (Span Mode)

Downstream of the aforementioned steps.

== Span Preparation

=== Transformation

#align(left, block($(I, A, W, L) -> (I, A, W, L, S_I, S_M)$))

=== Format

- $n, k, I, A, W, L$: same as before.
- $s$: maximum possible number of spans for one sequence
- $S_I$: span offsets, of type `i64` and shape $(n*s*2)$
- $S_M$: span masks, of type `bool` and shape $(n*s)$

#align(left, block($
S_I = mat(
  mat("start"_"1,1", "end"_"1,1"), mat("start"_"1,2", "end"_"1,2"), dots, mat("start"_"1,s", "end"_"1,s");
  mat("start"_"2,1", "end"_"2,1"), mat("start"_"2,2", "end"_"2,2"), dots, mat("start"_"2,s", "end"_"2,s");
  dots.v, dots.v, dots.down, dots.v;
  mat("start"_"n,1", "end"_"n,1"), mat("start"_"n,2", "end"_"n,2"), dots, mat("start"_"n,s", "end"_"n,s");
)
$))

#align(left, block($
S_M = mat(
  "span_mask"_"1,1", "span_mask"_"1,2", dots, "span_mask"_"1,s";
  "span_mask"_"2,1", "span_mask"_"2,2", dots, "span_mask"_"2,s";
  dots.v, dots.v, dots.down, dots.v;
  "span_mask"_"n,1", "span_mask"_"n,2", dots, "span_mask"_"n,s";
)
$))

=== Example

Note: for readability purposes, inside matrices are split into rows (one per token) but they are actually in one dimension $s$ (see format above).

#align(left, block($
S_I = mat(
  mat(
    mat(0, 0), mat(0, 1), mat(0, 2), mat(0, 3), mat(0, 4), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), ⏎; 
    mat(1, 1), mat(1, 2), mat(1, 3), mat(1, 4), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(2, 2), mat(2, 3), mat(2, 4), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(3, 3), mat(3, 4), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(4, 4), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, dots, dots, dots
  )
  ;
  mat(
    mat(0, 0), mat(0, 1), mat(0, 2), mat(0, 3), mat(0, 4), mat(0, 5), mat(0, 6), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), ⏎;
    mat(1, 1), mat(1, 2), mat(1, 3), mat(1, 4), mat(1, 5), mat(1, 6), mat(0, 0), dots, dots, dots, dots, dots, ⏎;
    mat(2, 2), mat(2, 3), mat(2, 4), mat(2, 5), mat(2, 6), mat(0, 0), dots, dots, dots, dots, dots, dots, ⏎;
    mat(3, 3), mat(3, 4), mat(3, 5), mat(3, 6), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(4, 4), mat(4, 5), mat(4, 6), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(5, 5), mat(5, 6), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, dots, ⏎;
    mat(6, 6), mat(0, 0), dots, dots, dots, dots, dots, dots, dots, dots, dots, dots
  )
)
$))

#align(left, block($
S_M = mat(
  mat(
    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ⏎; 
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  )
  ;
  mat(
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ⏎;
    1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ⏎;
    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ⏎;
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0; 
  )
)
$))

#pagebreak()
= Pre-Processing (Token Mode)

Nothing to be done beside the common steps.

#pagebreak()
= Post-Processing (Span Mode)

== Logits Output

=== Source Code

- Struct: `gliner::model::output::TensorOutput`

=== Format

- $n$: number of text sequences
- $w$: maximum number of tokens in one sequence
- $s$: maximum number of possible spans for one token (seee above)
- $k$: number of entity labels
- $O$: logits output, of type `f32` and shape $(n*w*s*k)$
- $v_"n,w,s,k"$: raw model output for sequence $n$, token $w$, span $s$ and label $k$.

#align(left, block($
O = mat(
  mat(
    mat(
      mat("v"_"1,1,1,1", dots, "v"_"1,1,1,k");
      dots.v;
      mat("v"_"1,1,s,1", dots, "v"_"1,1,s,k");
    ), 
    dots,
    mat(
      mat("v"_"1,w,1,1", dots, "v"_"1,w,1,k");
      dots.v;
      mat("v"_"1,w,s,1", dots, "v"_"1,w,s,k");
    )
  );
  dots.v;
  mat(
    mat(
      mat("v"_"n,1,1,1", dots, "v"_"n,1,1,k");
      dots.v;
      mat("v"_"n,1,s,1", dots, "v"_"n,1,s,k");
    ), 
    dots,
    mat(
      mat("v"_"n,w,1,1", dots, "v"_"n,w,1,k");
      dots.v;
      mat("v"_"n,w,s,1", dots, "v"_"n,w,s,k");
    )
  )
)
$))


=== Example

In this case $s=12$. For readability purposes, the raw values are "sigmoided" ($S(x)= 1/(1+e^(-x))$) and then "ReLUed" with a threshold $t=0.5$.


#align(left, block($
O_"S,t" = mat(
  mat(
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(bold(0.89), 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);      
  );
  mat(
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);      
      mat(0, 0), mat(0, bold(0.96)), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);
      mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0), mat(0, 0);      
  );
)
$))

Which means:
- In the 1st sequence, the span starting with the 4th token and ending with the 5th one has a probability of 0.89 to match the 1st entity class.
- In the 2nd sequence, the span starting with the 6th token and ending with the 7th one has a probability of 0.96 to match the second 2nd class.

#pagebreak()
== Span Decoding

=== Transformation

$(O, L) -> S$

=== Source Code

- Struct: `gliner::model::output::decoded::SpanOutput`
- Transformation: `gliner::model::output::decoded::span::TensorsToDecoded`

=== Format

- $t$: threshold
- $n$: number of input sequences
- $L$: text lengths as defined before
- $S$: sequence of spans $(i,j,k,p)$ where: 
  - $i$ is the index of the first token of sequence $m$ with $i<j$ and $i<L(m)$ 
  - $j$ is the index of the last token with the same constraints as $i$
  - $k$ is the entity class, 
  - $p$ is the probability for class $k$ with $p>=t$

#align(left, block($ 
S =
  mat(delim: "[",
    mat(delim: "[", (i_"1,1", j_"1,1", k_"1,1", p_"1,1"), (i_"1,2", j_"1,2", k_"1,2", p_"1,2"), dots);
    dots.v;
    mat(delim: "[", (i_"n,1", j_"n,1", k_"n,1", p_"n,1"), (i_"n,2", j_"n,2", k_"n,2", p_"n,2"), dots);
)
$))

=== Example

#align(left, block($ 
S =
  mat(delim: "[",
    mat(delim: "[", (4, 5, 1, 0.89));
    mat(delim: "[", (6, 7, 2, 0.96));
)
$))

#pagebreak()
= Post-Processing (Token Mode)

== Logits Output

=== Source Code

- Struct: `gliner::model::output::TensorOutput`

=== Format


- $n$: number of text sequences
- $w$: maximum number of tokens in one sequence
- $k$: number of entity labels
- $O$: logits output, of type `f32` and shape $(3*n*w*k)$ with:
  - $s_"n,w,k"$: raw model output for a start token $w$ in sequence $n$ and label $k$.
  - $e_"n,w,k"$: raw model output for an end token $w$ in sequence $n$ and label $k$.
  - $i_"n,w,k"$: raw model output for an inside token $w$ in sequence $n$ and label $k$.

#align(left, block($
O = mat(
  mat(
    mat(
      "s"_"1,1,1", dots, "s"_"1,1,k";
      dots.v, dots.down, dots.v;
      "s"_"1,w,1", dots, "s"_"1,w,k";
    ), 
    dots,
    mat(
      "s"_"n,1,1", dots, "s"_"n,1,k";
      dots.v, dots.down, dots.v;
      "s"_"n,w,1", dots, "s"_"n,w,k";
    ), 
  );
  mat(
    mat(
      "e"_"1,1,1", dots, "e"_"1,1,k";
      dots.v, dots.down, dots.v;
      "e"_"1,w,1", dots, "e"_"1,w,k";
    ), 
    dots,
    mat(
      "e"_"n,1,1", dots, "e"_"n,1,k";
      dots.v, dots.down, dots.v;
      "e"_"n,w,1", dots, "e"_"n,w,k";
    ), 
  );
  mat(
    mat(
      "i"_"1,1,1", dots, "i"_"1,1,k";
      dots.v, dots.down, dots.v;
      "i"_"1,w,1", dots, "i"_"1,w,k";
    ), 
    dots,
    mat(
      "i"_"n,1,1", dots, "i"_"n,1,k";
      dots.v, dots.down, dots.v;
      "i"_"n,w,1", dots, "i"_"n,w,k";
    ), 
  );
)
$))

=== Example

For readability purposes, the raw values are "sigmoided" ($S(x)= 1/(1+e^(-x))$) and then "ReLUed" with a threshold $t=0.5$.

#align(left, block($
O_"S,t" = mat(
  mat(
    mat(0, 0; 0, 0; 0, 0; bold(0.97), 0; 0, 0; 0, 0; 0, 0),
    mat(0, 0; 0, 0; 0, 0; 0, 0; 0, 0; 0, bold(0.99); 0, 0), 
  );
  mat(
    mat(0, 0; 0, 0; 0, 0; 0, 0; bold(0.96), 0; 0, 0; 0, 0),
    mat(0, 0; 0, 0; 0, 0; 0, 0; 0, 0; 0, 0; 0, bold(0.97)),
  );
  mat(
    mat(0, 0; 0, 0; 0, 0; bold(0.98), 0; bold(0.98), 0; 0, 0; 0, 0),
    mat(0, 0; 0, 0; 0, 0; 0, 0; 0, 0; 0, bold(0.99); 0, bold(0.99)),
  );
)
$))

#pagebreak()
== Span Decoding

=== Transformation

$O -> S$

=== Source Code

- Struct: `gliner::model::output::decoded::SpanOutput`
- Transformation: `gliner::model::output::decoded::token::TensorsToDecoded`

=== Format

Same format as in span-mode.

#pagebreak()
= Post-Processing (Common)

== Span Filtering (Greedy Search)

=== Transformation

$S -> S'$

=== Source Code

- Struct: `gliner::model::output::decoded::SpanOutput`
- Transformation: `gliner::model::output::decoded::greedy::GreedySearch`

=== Format

Same as span output.