Genie Engine JSON configuration string

The following sections contain information that pertains to the format of the JSON configuration string that is supplied to GenieEngineConfig_createFromJson. This JSON configuration can be supplied to the genie-t2t-run tool for switching the engine.

Note

Please refer to the example configs contained in the SDK at ${SDK_ROOT}/examples/Genie/configs/.

General configuration schema

The following provides the schema of the JSON configuration format that is provided to GenieEngineConfig_createFromJson. Note that dependencies are not specified in the schema, but are discussed in the following per-backend sections.

{
  "standalone-engine" : {
    "type": "object",
    "properties": {
      "version" : {"type": "integer"},
      "context" : {
        "type": "object",
        "properties": {
          "version" : {"type": "integer"},
          "size": {"type": "integer"},
          "n-vocab": {"type": "integer"},
          "draft-n-vocab": {"type": "integer"},
          "bos-token": {"type": "integer"},
          "eos-token": {"type": "array", "items": {"type": "integer"}},
          "pad-token": {"type": "integer"},
          "n-embd": {"type": "integer"}
        }
      },
      "embedding": {
        "type" : "object",
        "properties": {
          "version": {"type": "integer"},
          "type": {"type": "string", "enum" : ["lut", "callback"]},
          "lut-path": {"type": "string"},
          "size": {"type": "integer"},
          "datatype": {"type": "string"},
          "quant-param": {
            "type": "object",
            "properties": {
              "scale": {"type": "float"},
              "offset": {"type": "float"}
            }
          }
        }
      },
      "engine" : {
        "type": "array",
        "items": {
          "type" : "object",
          "properties": {
            "version" : {"type": "integer"},
            "n-threads" : {"type": "integer"},
            "backend" : {
              "type": "object",
              "properties": {
                "version" : {"type": "integer"},
                "type" : {"type": "string","enum" : ["QnnHtp", "QnnGenAiTransformer"]},
                "QnnHtp" : {
                  "type": "object",
                  "properties": {
                    "version" : {"type": "integer"},
                    "spill-fill-bufsize" : {"type": "integer"},
                    "data-alignment-size" : {"type": "integer"},
                    "use-mmap" : {"type": "boolean"},
                    "mmap-budget" : {"type": "integer"},
                    "poll" : {"type": "boolean"},
                    "pos-id-dim" : {"type": "integer"},
                    "cpu-mask" : {"type": "string"},
                    "kv-dim" : {"type": "integer"},
                    "allow-async-init" : {"type": "boolean"},
                    "enable-graph-switching" : {"type": "boolean"},
                    "skip-lora-validation" : {"type" : "boolean"},
                    "rope-theta" : {"type": "double"}
                  }
                },
                "QnnGenAiTransformer" : {
                  "type": "object",
                  "properties": {
                    "version" : {"type": "integer"},
                    "n-logits" : {"type": "integer"},
                    "n-layer" : {"type": "integer"},
                    "n-embd" : {"type": "integer"},
                    "n-heads" : {"type": "integer"},
                    "kv-quantization" : {"type": "boolean"}
                  }
                },
                "extensions" : {"type": "string"}
              }
            },
            "model" : {
              "type": "object",
              "properties": {
                "version" : {"type": "integer"},
                "type" : {"type": "string","enum":["binary", "library"]},
                "binary" : {
                  "type": "object",
                  "properties": {
                    "version" : {"type": "integer"},
                    "ctx-bins" : {"type": "array", "items": {"type": "string"}}
                  }
                },
                "library" : {
                  "type": "object",
                  "properties": {
                    "version" : {"type": "integer"},
                    "model-bin" : {"type": "string"}
                  }
                },
                "positional-encoding": {
                  "type" : "object",
                  "properties" :{
                    "type": {"type" : "string", "enum" : ["rope", "absolute", "alibi"]},
                    "rope-dim": {"type" : "integer"},
                    "rope-theta": {"type" : "double"},
                    "rope-scaling": {
                      "type" : "object",
                      "properties" :{
                        "rope-type": {"type" : "string", "enum" : ["llama3", "longrope"]},
                        "factor": {"type" : "integer"},
                        "high-freq-factor": {"type" : "integer"},
                        "low-freq-factor": {"type" : "integer"},
                        "original-max-position-embeddings": {"type" : "integer"}
                      }
                    }
                  }
                },
                "draft-token-map" : {"type" : "string"}
              }
            }
          }
        }
      }
    }
  }
}

Option

Applicability

Description

standalone-engine::version

all backends

Version of standalone-engine object that is supported by APIs. (1)

context::version

all backends

Version of context object that is supported by APIs. (1)

context::size

all backends

Context length. Maximum number of tokens to store.

context::n-vocab

all backends

Model vocabulary size.

context::draft-n-vocab

all backends

Draft model vocabulary size.

context::bos-token

all backends

Beginning of sentence token.

context::eos-token

all backends

End of sentence token. Argument passed in as an integer or array of integers.

context::pad-token

all backends

Padding token.

context::n-embd

all backends

Embedding size of the input.

embedding::version

all backends

Version of embedding object that is supported by APIs. (1)

embedding::type

all backends

Type of embedding to use. Supported options: lut, callback

embedding::lut-path

all backends

Path to the look up table for embeddings.

embedding::size

all backends

Size of each token embedding for the model.

embedding::datatype

all backends

Datatype of lut.

quant-param::scale

all backends

Quantization scale of the lut.

quant-param::offset

all backends

Quantization offset of the lut.

engine::version

all backends

Version of engine object that is supported by APIs. (1)

engine::n-threads

all backends

Number of threads to use for KV-cache updates.

backend::version

all backends

Version of backend object that is supported by APIs. (1)

backend::type

all backends

Type of engine like “QnnHtp” for QNN HTP, “QnnGenAiTransformer” for QNN GenAITransformer backend and “QnnGpu” for QNN GPU.

backend::extensions

QNN HTP

Path to backend extensions configuration file.

QnnHtp::version

QNN HTP

Version of QnnHtp object that is supported by APIs. (1)

QnnHtp::spill-fill-bufsize

QNN HTP

Buffer size to pre-allocate for the QNN HTP spill fill. This field depends upon the HTP VTCM memory size. It should be set greater than the spill-fill required by each context binary in the model. Consult the QNN HTP backend documentation in the QAIRT SDK for more details.

QnnHtp::data-alignment-size

QNN HTP

Data will be aligned by rounding up the size to the nearest multiple of alignment number. Typically should be zero.

QnnHtp::use-mmap

QNN HTP

Memory map the context binary files. Typically should be turned on.

QnnHtp::mmap-budget

QNN HTP

Memory map the context binary files in chunks of the given size. Typically should be 25MB.

QnnHtp::poll

QNN HTP

Specify whether to busy-wait on threads.

QnnHtp::pos-id-dim

QNN HTP

Dimension of positional embeddings, usually (kv-dim) / 2.

QnnHtp::cpu-mask

QNN HTP

CPU affinity mask.

QnnHtp::kv-dim

QNN HTP

Dimension of the KV-cache embedding.

QnnHtp::allow-async-init

QNN HTP

Allow context binaries to be initialized asynchronously if the backend supports it.

QnnHtp::enable-graph-switching

QNN HTP

Enables graph switching for graphs within each context binary.

QnnHtp::skip-lora-validation

QNN HTP

Skips CRC validation when LoRA binary sections are applied. Please refer to QNN HTP documentation for more information.

QnnHtp::rope-theta

QNN HTP

Used to calculate rotary positional encodings.

QnnGenAiTransformer::version

QNN GenAiTransformer

Version of QnnGenAiTransformer object that is supported by APIs. (1)

QnnGenAiTransformer::n-logits

QNN GenAiTransformer

Number of logit vectors that result will have for sampling.

QnnGenAiTransformer::n-layer

QNN GenAiTransformer

Number of decoder layers in the model.

QnnGenAiTransformer::n-embd

QNN GenAiTransformer

Size of embedding vector for each token.

QnnGenAiTransformer::n-heads

QNN GenAiTransformer

Number of attention heads in the model.

QnnGenAiTransformer::kv-quantization

QNN GenAiTransformer

Quantize KV Cache to Q8_0_32.

model::version

all backends

Version of model object that is supported by APIs. (1)

model::type

all backends

Type of model object “binary” for QNN HTP and “library” for QNN GenAiTransformer.

model::positional-encoding

all backends

Captures positional encoding parameters for a model.

positional-encoding::type

all backends

Type of positional encoding. Supported types are rope, alibi and absolute

positional-encoding::rope-dim

all backends

Dimension of Rope positional embeddings, usually (kv-dim) / 2.

positional-encoding::rope-theta

all backends

Used to calculate rotary position encodings for type rope.

rope-scaling::rope-type

all backends

Type of rope scaling. Supported types are llama3 and longrope.

rope-scaling::factor

all backends

Rope scaling factor

rope-scaling::high-freq-factor

all backends

High frequency rope scaling factor.

rope-scaling::low-freq-factor

all backends

Low frequency rope scaling factor.

rope-scaling::original-max-position-embeddings

all backends

Original maximum positional embedding length.

model::draft-token-map

all backends

Map for draft-token-id to target-token-id.

binary::version

QNN HTP

Version of binary object that is supported by APIs. (1)

binary::ctx-bins

QNN HTP

List of serialized model files.

library::version

QNN GenAiTransformer

Version of library object that is supported by APIs. (1)

library::model-bin

QNN GenAiTransformer

Path to model.bin file.

Standalone Engine configuration example for QNN HTP

The following is an example configuration for the standalone-engine for HTP backend.

{
  "standalone-engine" : {
    "version" : 1,
    "embedding": {
      "version": 1,
      "type": "lut",
      "lut-path": "uint8_lut.bin",
      "size": 3072,
      "datatype": "ufixed8",
      "quant-param": {
        "scale":  0.0022299130757649738,
        "offset":   -107.10983274409456
      }
    },
    "context": {
      "version": 1,
      "n-vocab": 128256,
      "draft-n-vocab": 3200,
      "size": 2048,
      "bos-token": 128000,
      "eos-token": [
        128001,
        128008,
        128009
      ],
      "n-embd": 3072,
      "pad-token": 128004
    },
    "engine" :       {
      "version": 1,
      "role": "draft",
      "n-threads": 3,
      "backend": {
        "version": 1,
        "type": "QnnHtp",
        "QnnHtp": {
          "version": 1,
          "spill-fill-bufsize": 0,
          "use-mmap": true,
          "mmap-budget": 0,
          "poll": true,
          "cpu-mask": "0xe0",
          "kv-dim": 128,
          "allow-async-init": false,
          "enable-graph-switching": false
        },
        "extensions": "htp_backend_ext_config.json"
      },
      "model": {
        "version": 1,
        "type": "binary",
        "binary": {
          "version": 1,
          "ctx-bins": [
            "weight_sharing_model_2_of_3.serialized.bin",
            "weight_sharing_model_3_of_3.serialized.bin"
          ]
        },
        "positional-encoding": {
          "type": "rope",
          "rope-dim": 64,
          "rope-theta": 500000,
          "rope-scaling": {
            "factor": 32,
            "high-freq-factor": 4,
            "low-freq-factor": 1,
            "original-max-position-embeddings": 8192,
            "rope-type": "llama3"
          }
        },
        "draft-token-map" : "vocab_trim_elementary16181.json"
      }
    }
  }
}

An example of a standalone-engine configuration for HTP can be found at ${SDK_ROOT}/examples/Genie/configs/standalone-engine.json.