Skip to content

llmcompressor.modifiers.quantization.calibration

Functions:

calibrate_activations

calibrate_activations(
    module: Module, value: Tensor, base_name: str
)

Calibrate input or output activations by calling the module's attached observer.

Parameters:

  • module

    (Module) –

    torch.nn.Module

  • base_name

    (str) –

    substring used to fetch the observer, scales, and zp

  • value

    (Tensor) –

    torch.Tensor to be passed to the observer

Source code in llmcompressor/modifiers/quantization/calibration.py
def calibrate_activations(module: Module, value: torch.Tensor, base_name: str):
    """
    Feed an input or output activation tensor to the observer attached to a
    module so that the module's quantization parameters are refreshed.

    :param module: torch.nn.Module whose observer is called
    :param base_name: substring used to fetch the observer, scales, and zp
    :param value: torch.Tensor to be passed to the observer
    """
    # An empty tensor gives the observer nothing to compute zp/scale from
    # (this case arises for MoEs), so skip the update entirely
    if value.numel() == 0:
        return

    # Only "output" selects the output activation args; every other base name
    # (input, q, k, v) is resolved against the input activation args
    if base_name == "output":
        field_name = "output"
    else:
        field_name = "input"
    activation_args = getattr_chain(
        module, f"quantization_scheme.{field_name}_activations", None
    )

    if activation_args is None:
        # no args found: compute scale/zp, skip the global scale
        wants_qparams, wants_gparam = True, False
    else:
        # dynamic (or locally dynamic) quantization skips the scale/zp update
        wants_qparams = activation_args.dynamic not in (True, DynamicType.LOCAL)
        # the tensor-group strategy additionally requires a global scale
        wants_gparam = bool(
            activation_args.strategy == QuantizationStrategy.TENSOR_GROUP
        )

    call_observer(
        module=module,
        base_name=base_name,
        value=value,
        should_calculate_gparam=wants_gparam,
        should_calculate_qparams=wants_qparams,
    )

calibrate_input_hook

calibrate_input_hook(module: Module, args: Any)

Hook to calibrate input activations. Will call the observers to update the scales/zp before applying input QDQ in the module's forward pass.

Source code in llmcompressor/modifiers/quantization/calibration.py
def calibrate_input_hook(module: Module, args: Any):
    """
    Hook that calibrates input activations.
    The observers are called to update the scales/zp before input QDQ is
    applied in the module's forward pass.

    :param module: torch.nn.Module whose input observer is called
    :param args: arguments received by the hook; when this is a tuple, only
        its first element is used as the activation
    """
    if isinstance(args, tuple):
        activation = args[0]
    else:
        activation = args
    calibrate_activations(module, value=activation, base_name="input")

calibrate_output_hook

calibrate_output_hook(
    module: Module, _args: Any, output: Tensor
)

Hook to calibrate output activations. Will call the observers to update the scales/zp before applying output QDQ.

Source code in llmcompressor/modifiers/quantization/calibration.py
def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor):
    """
    Hook that calibrates output activations.
    The observers are called to update the scales/zp, after which output QDQ
    is applied to the tensor.

    :param module: torch.nn.Module whose output observer is called
    :param _args: arguments received by the hook (unused)
    :param output: output tensor of the module
    :return: the result of `forward_quantize` applied to `output`
    """
    # refresh the qparams first so the QDQ below uses the updated values
    calibrate_activations(module, value=output, base_name="output")

    output_args = module.quantization_scheme.output_activations
    return forward_quantize(
        module=module,
        value=output,
        base_name="output",
        args=output_args,
    )

call_observer

call_observer(
    module: Module,
    base_name: str,
    value: Optional[Tensor] = None,
    should_calculate_gparam: bool = False,
    should_calculate_qparams: bool = True,
)

Call a module's attached input/weight/output observer using a provided value. Update the module's scale and zp using the observer's return values.

Parameters:

  • module

    (Module) –

    torch.nn.Module

  • base_name

    (str) –

    substring used to fetch the observer, scales, and zp

  • value

    (Optional[Tensor], default: None ) –

    torch.Tensor to be passed to the observer for activations. If base_name is "weight", then the module's weight tensor will be used

Source code in llmcompressor/modifiers/quantization/calibration.py
def call_observer(
    module: Module,
    base_name: str,
    value: Optional[torch.Tensor] = None,
    should_calculate_gparam: bool = False,
    should_calculate_qparams: bool = True,
):
    """
    Run the input/weight/output observer attached to a module on a tensor and
    write the observer's results back onto the module.

    :param module: torch.nn.Module
    :param base_name: substring used to fetch the observer, scales, and zp
    :param value: torch.Tensor to be passed to the observer for activations. If
        omitted and base_name is "weight", the module's weight tensor is used
    :param should_calculate_gparam: when True, compute the global scale with
        the observer and store it as `{base_name}_global_scale`
    :param should_calculate_qparams: when True, compute scale and zero point
        with the observer and store them as `{base_name}_scale` and, if the
        module has that attribute, `{base_name}_zero_point`
    """
    with align_module_device(module):
        # fall back to the module's own weight only when no value was supplied
        use_module_weight = value is None and base_name == "weight"
        observed = module.weight if use_module_weight else value

        observer: Observer = getattr(module, f"{base_name}_observer")

        if should_calculate_gparam:
            update_offload_parameter(
                module,
                f"{base_name}_global_scale",
                observer.get_global_scale(observed),
            )

        if not should_calculate_qparams:
            return

        scale, zero_point = observer(observed)
        update_offload_parameter(module, f"{base_name}_scale", scale)

        # the zero point is only written when the module carries one
        zp_name = f"{base_name}_zero_point"
        if hasattr(module, zp_name):
            update_offload_parameter(module, zp_name, zero_point)

freeze_module_quantization

freeze_module_quantization(module: Module)

Deletes observers when calibration is complete.

apply to full model with model.apply(freeze_module_quantization)

Parameters:

  • module

    (Module) –

    module to freeze quantization for

Source code in llmcompressor/modifiers/quantization/calibration.py
def freeze_module_quantization(module: Module):
    """
    Remove a module's observers once calibration has finished and mark the
    module as frozen.

    Apply to a full model with `model.apply(freeze_module_quantization)`.

    :param module: module to freeze quantization for
    """
    # modules without a quantization scheme are left untouched
    if not getattr(module, "quantization_scheme", None):
        return

    # already frozen modules need no further work
    if module.quantization_status == QuantizationStatus.FROZEN:
        return

    # drop every observer that is currently attached
    observer_names = [
        f"{prefix}_observer" for prefix in ("input", "weight", "output", "q", "k", "v")
    ]
    for observer_name in observer_names:
        if hasattr(module, observer_name):
            delattr(module, observer_name)

    module.quantization_status = QuantizationStatus.FROZEN

initialize_observer

initialize_observer(module: Module, base_name: str)

Initialize an observer module and attach it as a submodule. The name of the observer is fetched from the quantization_args. That name is then used to load the observer from the registry, and the loaded observer is attached to the module. The attribute name of the observer is derived from the base_name provided.

This function always initializes memoryless observers for weights

Parameters:

  • module

    (Module) –

    torch.nn.Module that the observer is being attached to

  • base_name

    (str) –

    str used to name the observer attribute

Source code in llmcompressor/modifiers/quantization/calibration.py
def initialize_observer(
    module: Module,
    base_name: str,
):
    """
    Initialize observer module and attach as submodule.
    The name of the observer is fetched from the quantization_args.
    The name is then used to load the observer from the registry and attached
    to the module. The name of the observer uses the base_name provided.

    This function always initializes memoryless observers for weights

    If the module has no quantization args for the requested base_name, or the
    args are fully dynamic (`dynamic is True`), no observer is attached.

    :param module: torch.nn.Module that the observer is being attached to
    :param base_name: str used to name the observer attribute

    """
    if base_name == "weight":
        arg_name = "weights"
    elif base_name == "output":
        arg_name = "output_activations"
    else:  # input, q, k, v
        arg_name = "input_activations"

    args: Optional[QuantizationArgs] = getattr_chain(
        module, f"quantization_scheme.{arg_name}", None
    )
    if args is None:
        # Nothing is configured for this base_name. Bail out before touching
        # `args.observer`, which would otherwise raise AttributeError on None.
        return

    observer = args.observer

    # training is no longer supported: always use memoryless for weights
    memoryless_overrides = {
        "static_minmax": "memoryless_minmax",
        "minmax": "memoryless_minmax",
        "mse": "memoryless_mse",
    }
    if base_name == "weight" and args.observer in memoryless_overrides:
        observer = memoryless_overrides[args.observer]
        logger.warning(
            "Overriding weight observer for lower memory usage "
            f"({args.observer} -> {observer})",
            log_once=True,
        )

    # only `dynamic is True` skips the observer; other dynamic settings
    # still get one attached
    if args.dynamic is not True:
        observer = Observer.load_from_registry(
            observer, base_name=base_name, args=args, module=module
        )
        module.register_module(f"{base_name}_observer", observer)

update_weight_zp_scale

update_weight_zp_scale(module: Module)

Marks a layer as ready for calibration, which activates observers to update scales and zero points on each forward pass.

apply to full model with model.apply(update_weight_zp_scale)

Parameters:

  • module

    (Module) –

    module to set for calibration

  • quantize_weights_upfront

    whether to automatically run weight quantization at the start of calibration

Source code in llmcompressor/modifiers/quantization/calibration.py
def update_weight_zp_scale(module: Module):
    """
    Call the weight observer of a module to update its weight quantization
    parameters. Modules without weight quantization args are skipped.

    A warning is logged when the module is not in calibration mode; the
    observer is still called in that case.

    apply to full model with `model.apply(update_weight_zp_scale)`

    :param module: module whose weight quantization parameters are updated
    """
    weight_args = getattr_chain(module, "quantization_scheme.weights", None)
    if weight_args is None:
        return

    status = getattr(module, "quantization_status", None)
    if status != QuantizationStatus.CALIBRATION:
        logger.warning(
            "Attempting to calibrate weights of a module not in calibration mode"
        )

    call_observer(module=module, base_name="weight")