Skip to content

llmcompressor.entrypoints.model_free.process

Functions:

process_file

process_file(
    file_path: str | PathLike,
    save_path: str | PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: str | device,
) -> tuple[int, dict[str, str]]

Quantize and compress tensors in a given safetensors file

Parameters:

  • file_path

    (str | PathLike) –

    safetensors file to process

  • save_path

    (str | PathLike) –

    save path of file with quantized weights

  • scheme

    (QuantizationScheme) –

    quantization scheme to apply to tensors

  • ignore

    (Iterable[str]) –

    modules to ignore. Modules ending with "norm" are automatically ignored

  • device

    (str | device) –

    device used to quantize and compress weights

Source code in llmcompressor/entrypoints/model_free/process.py
def process_file(
    file_path: str | os.PathLike,
    save_path: str | os.PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: str | torch.device,
) -> tuple[int, dict[str, str]]:
    """
    Quantize and compress the quantizable tensors of a safetensors file and
    write the result to ``save_path``.

    :param file_path: safetensors file to process
    :param save_path: save path of file with quantized weights
    :param scheme: quantization scheme to apply to tensors
    :param ignore: modules to ignore. Modules ending with "norm" are automatically
        ignored
    :param device: device used to quantize and compress weights
    :return: tuple of (total bytes in the saved file, mapping from tensor name
        to the saved file's basename)
    """
    assert not is_microscale_scheme(scheme), "Use `_process_file_microscale_scheme`"
    tensors = load_file(file_path)

    for module_name, weight_name in iter_quantizable_tensors(tensors, ignore):
        weight = tensors[weight_name]
        validate_weight_for_quantization(weight, scheme, weight_name)

        # build a quantized linear module on `device`, calibrate its weight
        # scale/zero-point, then compress the weight using those qparams
        module = initialize_quantized_linear(weight, scheme, device)
        calibrate_scale_zp(module)
        compress_module(module)

        # replace the original weight with the module's compressed tensors,
        # moved back to cpu for serialization
        del tensors[weight_name]
        prefix = module_name + "."
        for key, value in module.state_dict(prefix=prefix).items():
            tensors[key] = value.to("cpu")

    save_file(tensors, save_path)
    file_name = os.path.basename(save_path)
    total_size = sum(tensor.nbytes for tensor in tensors.values())
    return total_size, {key: file_name for key in tensors}

process_file_microscale_scheme

process_file_microscale_scheme(
    file_path: str | PathLike,
    save_path: str | PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: str | device,
) -> tuple[int, dict[str, str]]

Quantize and compress tensors in a given safetensors file

Parameters:

  • file_path

    (str | PathLike) –

    safetensors file to process

  • save_path

    (str | PathLike) –

    save path of file with quantized weights

  • scheme

    (QuantizationScheme) –

    quantization scheme to apply to tensors

  • ignore

    (Iterable[str]) –

    modules to ignore. Modules ending with "norm" are automatically ignored

  • device

    (str | device) –

    device used to quantize and compress weights

Source code in llmcompressor/entrypoints/model_free/process.py
def process_file_microscale_scheme(
    file_path: str | os.PathLike,
    save_path: str | os.PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: str | torch.device,
) -> tuple[int, dict[str, str]]:
    """
    Quantize and compress tensors in a given safetensors file using a
    microscale scheme. Unlike `process_file`, tensors belonging to a fused
    set share a single (minimum) global scale, so their scale/zero-point
    calibration is deferred until all members have been collected.

    :param file_path: safetensors file to process
    :param save_path: save path of file with quantized weights
    :param scheme: quantization scheme to apply to tensors
    :param ignore: modules to ignore. Modules ending with "norm" are automatically
        ignored
    :param device: device used to quantize and compress weights
    :return: tuple of (total bytes in the saved file, mapping from tensor name
        to the saved file's basename)
    """
    assert is_microscale_scheme(scheme), "Use `_process_file` for non-microscale scheme"
    tensors = load_file(file_path)
    fused_sets, unmatched_sets = get_fused_names(tensors)
    assert len(unmatched_sets) <= 0  # should be caught by `validate_safetensors_index`

    fused_name_to_fused_index: dict[str, int]  # fused_name -> fused_index
    fused_modules: dict[int, dict[str, Module]]  # fused_index -> named_modules

    # invert `fused_sets` so each tensor name maps to the index of its set
    fused_name_to_fused_index = {
        name: index
        for index, matched_set in enumerate(fused_sets)
        for name in matched_set.values()
    }
    fused_modules = defaultdict(dict)

    for module_name, name in iter_quantizable_tensors(tensors, ignore):
        validate_weight_for_quantization(tensors[name], scheme, name)

        # 1. initialize module with qparams (on device)
        module = initialize_quantized_linear(tensors[name], scheme, device)

        # 2. calibrate weight qparams. Delay scale/zp calibration for fused modules
        # until all members of the fused set have been collected below
        calibrate_global_scale(module)
        if name in fused_name_to_fused_index:
            fused_index = fused_name_to_fused_index[name]
            fused_modules[fused_index][name] = module
            continue

        calibrate_scale_zp(module)

        # 3. compress module using qparams
        compress_module(module)

        # 4. save compressed data (on cpu)
        del tensors[name]
        prefix = module_name + "."
        for key, value in module.state_dict(prefix=prefix).items():
            tensors[key] = value.to("cpu")

    # compress and save microscale fused modules
    for named_modules in fused_modules.values():
        # 2.1. fuse global scales: all members share the set's minimum scale
        global_scales = [m.weight_global_scale for m in named_modules.values()]
        fused_global_scale = torch.min(torch.cat(global_scales, dim=0))

        for name, module in named_modules.items():
            # recover module name from the tensor name (strip ".weight" suffix)
            module_name, _ = name.rsplit(".", 1)
            module.weight_global_scale.data.copy_(fused_global_scale)

            # 2.2. finish calibration with fused global scales
            calibrate_scale_zp(module)

            # 3. compress module using microscale qparams
            compress_module(module)

            # 4. save compressed data (on cpu)
            del tensors[name]
            prefix = module_name + "."
            for key, value in module.state_dict(prefix=prefix).items():
                tensors[key] = value.to("cpu")

    save_file(tensors, save_path)
    total_size = sum(tensor.nbytes for tensor in tensors.values())
    weight_map = {key: os.path.basename(save_path) for key in tensors.keys()}
    return total_size, weight_map

validate_file

validate_file(
    file_path: str | PathLike,
    save_path: str | PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: str | device,
)

Validate that each quantizable tensor in a safetensors file can be quantized.

Parameters:

  • file_path

    (str | PathLike) –

    safetensors file to validate

  • save_path

    (str | PathLike) –

    unused by this function

  • scheme

    (QuantizationScheme) –

    quantization scheme to apply to tensors

  • ignore

    (Iterable[str]) –

    modules to ignore. Modules ending with "norm" are automatically ignored

  • device

    (str | device) –

    unused by this function

Source code in llmcompressor/entrypoints/model_free/process.py
def validate_file(
    file_path: str | os.PathLike,
    save_path: str | os.PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: str | torch.device,
):
    """
    Validate that each quantizable tensor in a safetensors file can be quantized.

    :param file_path: safetensors file to validate
    :param save_path: unused by this function; kept for signature parity with
        `process_file`
    :param scheme: quantization scheme to apply to tensors
    :param ignore: modules to ignore. Modules ending with "norm" are automatically
        ignored
    :param device: unused by this function; kept for signature parity with
        `process_file`
    """
    tensors = load_file(file_path)

    for _module_name, weight_name in iter_quantizable_tensors(tensors, ignore):
        validate_weight_for_quantization(tensors[weight_name], scheme, weight_name)