MII v0.1.0 release #252

Merged · 12 commits · Nov 3, 2023

2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -42,7 +42,7 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401,mii/grpc_related/proto/modelresponse_pb2.py:F821,F401']
args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401,mii/grpc_related/proto/modelresponse_pb2.py:F821,F401,mii/legacy/grpc_related/proto/legacymodelresponse_pb2.py:F821,F401']

- repo: local
hooks:
359 changes: 128 additions & 231 deletions README.md

Large diffs are not rendered by default.

Binary file added docs/images/fast-gen-overview.png
Binary file added docs/images/fastgen-arch-dark.png
Binary file added docs/images/fastgen-arch-light.png
Binary file added docs/images/fastgen-hero-dark.png
Binary file added docs/images/fastgen-hero-light.png
Binary file added docs/images/fastgen-hero.png
Binary file added docs/images/fastgen-overview-dark.png
Binary file added docs/images/fastgen-overview-light.png
Binary file added docs/images/mii-arch-dark.png
Binary file added docs/images/mii-arch-light.png
2 changes: 2 additions & 0 deletions examples/README.md
@@ -0,0 +1,2 @@
# MII Examples
Please see [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII.
22 changes: 11 additions & 11 deletions mii/__init__.py
@@ -2,19 +2,19 @@
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
import grpc
from .server import MIIServer
from .client import MIIClient, mii_query_handle
from .deployment import deploy
from .terminate import terminate
from .constants import DeploymentType, TaskType
from .aml_related.utils import aml_output_path
from .config import MIIConfig, ModelConfig
from .utils import get_supported_models
from .grpc_related.proto import modelresponse_pb2_grpc
try:
import grpc
from .pipeline import pipeline
from .server import serve
from .client import client
except ImportError as e:
print("Warning: DeepSpeed-FastGen could not be imported:")
print(e)
pass

from .legacy import MIIServer, MIIClient, mii_query_handle, deploy, terminate, DeploymentType, TaskType, aml_output_path, MIIConfig, ModelConfig, get_supported_models

__version__ = "0.0.0"
non_persistent_models = {}
try:
from .version import __version__
except ImportError:
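
The reworked `mii/__init__.py` guards the new DeepSpeed-FastGen entry points (`pipeline`, `serve`, `client`) behind a try/except so that a missing `grpc` or engine dependency does not break the import, while the pre-0.1.0 API stays reachable through the `mii.legacy` re-exports. A minimal sketch of the new non-persistent pipeline, assuming the entry points shown in this diff (the model name and generation arguments are illustrative):

```python
import mii

# Non-persistent pipeline: loads the model in the current process and
# generates directly, without standing up a gRPC server.
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")  # illustrative model name
responses = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=64)
print(responses)

# The legacy API remains importable from the top level, e.g. mii.deploy(...)
# and mii.terminate(...), via the re-exports from mii.legacy above.
```
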
6 changes: 6 additions & 0 deletions mii/batching/__init__.py
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .ragged_batching import MIIAsyncPipeline, MIIPipeline
111 changes: 111 additions & 0 deletions mii/batching/generation/logit_processors.py
@@ -0,0 +1,111 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
import abc
from typing import List, Optional

import torch
import torch.nn.functional as F

FLOAT_PAD = -float("inf")


class BaseLogitProcessor(abc.ABC):
def __call__(self, logits: torch.Tensor) -> torch.Tensor:
return self.forward(logits)

@abc.abstractmethod
def forward(self, logits: torch.Tensor) -> torch.Tensor:
...

def get_key(self) -> str:
return self.__class__.__name__


class TopKLogitProcessor(BaseLogitProcessor):
def __init__(self, top_k: int) -> None:
self.top_k = top_k

def forward(self, logits: torch.Tensor) -> torch.Tensor:
# Remove all tokens with a probability less than the
# last token of the top-k
indices_to_remove = logits < torch.topk(logits, self.top_k)[0][..., -1, None]
logits[indices_to_remove] = FLOAT_PAD
return logits

def get_key(self) -> str:
return super().get_key() + f"_top_k={self.top_k}"


class TopPLogitProcessor(BaseLogitProcessor):
def __init__(self, top_p: float) -> None:
assert 0.0 <= top_p <= 1.0
self.top_p = top_p

def forward(self, logits: torch.Tensor) -> torch.Tensor:
# Sort logits in descending order so cumulative probabilities can be computed
sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > self.top_p
# Shift the mask one position to the right so the first token above the
# threshold is also kept
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
for i in range(sorted_indices.size(0)):
indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
logits[i][indices_to_remove] = FLOAT_PAD
return logits

def get_key(self) -> str:
return super().get_key() + f"_top_p={self.top_p}"


class TemperatureLogitProcessor(BaseLogitProcessor):
def __init__(self, temperature: float) -> None:
self.temperature = temperature
assert self.temperature > 0.0

def forward(self, logits: torch.Tensor) -> torch.Tensor:
return logits / self.temperature

def get_key(self) -> str:
return super().get_key() + f"_temperature={self.temperature}"


class PipelineLogitProcessor(BaseLogitProcessor):
def __init__(self, pipeline: List[BaseLogitProcessor]) -> None:
assert all(isinstance(step, BaseLogitProcessor) for step in pipeline)
self.pipeline = pipeline

def forward(self, logits: torch.Tensor) -> torch.Tensor:
for step in self.pipeline:
logits = step(logits)
return logits

def get_key(self) -> str:
return super().get_key(
) + f"_{'_'.join(step.get_key() for step in self.pipeline)}"


class NucleusSamplingLogitProcessor(BaseLogitProcessor):
def __init__(self,
top_k: Optional[int] = None,
top_p: Optional[float] = None) -> None:
assert top_k is not None or top_p is not None
if top_k is None:
self._processor = TopPLogitProcessor(top_p)
elif top_p is None:
self._processor = TopKLogitProcessor(top_k)
else:
self._processor = PipelineLogitProcessor(
[TopKLogitProcessor(top_k),
TopPLogitProcessor(top_p)])

def forward(self, logits: torch.Tensor) -> torch.Tensor:
return self._processor(logits)

def get_key(self) -> str:
return super().get_key() + f"_{self._processor.get_key()}"
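
A short sketch of how these processors compose on a batch of logits, assuming the module path added in this PR; the tensor shape and hyperparameters are illustrative, not taken from the diff:

```python
import torch

from mii.batching.generation.logit_processors import (
    PipelineLogitProcessor,
    TemperatureLogitProcessor,
    TopKLogitProcessor,
    TopPLogitProcessor,
)

# Dummy logits for a batch of 2 sequences over a 32k-token vocabulary.
logits = torch.randn(2, 32000)

# Chain temperature scaling, top-k, and top-p filtering; the filtering steps
# set removed positions to -inf so they get zero probability after softmax.
chain = PipelineLogitProcessor([
    TemperatureLogitProcessor(temperature=0.7),
    TopKLogitProcessor(top_k=50),
    TopPLogitProcessor(top_p=0.95),
])
filtered = chain(logits)

# get_key() yields a cache-friendly identifier for the configured chain.
print(chain.get_key())
```
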
57 changes: 57 additions & 0 deletions mii/batching/generation/samplers.py
@@ -0,0 +1,57 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
import abc
from typing import Tuple

import torch
from torch.distributions import Categorical


class BaseGenerationSampler(abc.ABC):
@abc.abstractmethod
def __call__(
self,
logits: torch.Tensor,
) -> Tuple[torch.LongTensor,
torch.Tensor]:
"""
Given the logits, return the next token to add to the sequence, as well
as the log probability of the token

Args:
logits (torch.Tensor):
The logits from the model. Shape: (batch_size, vocab_size)

Returns:
Tuple[torch.LongTensor, torch.Tensor]:
The next token to add to the sequence, and the log probability
of the token. Shape: (batch_size,) and (batch_size,)
"""
...

def get_key(self) -> str:
return self.__class__.__name__


class LogitsSampler(BaseGenerationSampler):
def __call__(
self,
logits: torch.Tensor,
) -> Tuple[torch.LongTensor,
torch.Tensor]:
logits = logits.float()
sampler = Categorical(logits=logits)
next_tokens = sampler.sample()
logprobs = sampler.log_prob(next_tokens)
return next_tokens, logprobs


class GreedySampler(BaseGenerationSampler):
def __call__(self, logits: torch.Tensor) -> Tuple[torch.LongTensor, torch.Tensor]:
logits = logits.float()
sampler = Categorical(logits=logits)
next_tokens = logits.argmax(dim=-1)
logprobs = sampler.log_prob(next_tokens)
return next_tokens, logprobs
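
A brief usage sketch contrasting the two samplers; the logits below are dummy values used only to show the shapes involved:

```python
import torch

from mii.batching.generation.samplers import GreedySampler, LogitsSampler

# Dummy logits for a batch of 4 sequences over a 32k-token vocabulary.
logits = torch.randn(4, 32000)

# Stochastic sampling from the categorical distribution defined by the logits.
next_tokens, logprobs = LogitsSampler()(logits)

# Deterministic argmax decoding; the log-probability of the chosen token is
# still reported through the same Categorical distribution.
greedy_tokens, greedy_logprobs = GreedySampler()(logits)

assert next_tokens.shape == greedy_tokens.shape == (4, )
```
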
97 changes: 97 additions & 0 deletions mii/batching/generation/stop_criterion.py
@@ -0,0 +1,97 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
import abc
from typing import List, Union

import torch

# from megatron import get_tokenizer
# from megatron.tokenizer.tokenizer import AbstractTokenizer


class BaseGenerationStopCriterion(abc.ABC):
def __init__(self, tokenizer):
self.tokenizer = tokenizer

def __call__(self, tokens: torch.LongTensor) -> torch.BoolTensor:
return self.forward(tokens)

@abc.abstractmethod
def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor:
...

def get_key(self) -> str:
return self.__class__.__name__


class TokenStopCriterion(BaseGenerationStopCriterion):
def __init__(self, token: Union[str, int], tokenizer) -> None:
super().__init__(tokenizer=tokenizer)
if isinstance(token, str):
token_id = self.tokenizer.tokenize(token)[0]
else:
token_id = token
self.stop_token_id = token_id

def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor:
retval = torch.zeros_like(tokens, dtype=torch.bool)
retval |= tokens == self.stop_token_id
return retval

def get_key(self) -> str:
return self.__class__.__name__ + f"_token_id={self.stop_token_id}"


class EosGenerationStopCriterion(BaseGenerationStopCriterion):
def __init__(self, tokenizer):
super().__init__(tokenizer=tokenizer)
if hasattr(self.tokenizer, "eod"):
self.eos_id = self.tokenizer.eod
elif hasattr(self.tokenizer, "eos_token_id"):
self.eos_id = self.tokenizer.eos_token_id
elif hasattr(self.tokenizer, "eos_token"):
self.eos_id = self.tokenizer.eos_token
else:
raise ValueError(
"Tokenizer must have either an `eod` or `eos_token` attribute.")

def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor:
return tokens == self.eos_id


class NewLineDelimitedStopCriterion(BaseGenerationStopCriterion):
def __init__(self, tokenizer):
super().__init__(tokenizer=tokenizer)
self.stop_token_ids = list(
set([self.tokenizer.tokenize(x)[0] for x in ["\n",
"\r\n",
"\n\n",
".\n\n"]]))

def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor:
retval = torch.zeros_like(tokens, dtype=torch.bool)
for stop_token_id in self.stop_token_ids:
retval |= tokens == stop_token_id
return retval


class PipelinedCriterion(BaseGenerationStopCriterion):
def __init__(
self,
criteria: List[BaseGenerationStopCriterion],
tokenizer,
):
super().__init__(tokenizer=tokenizer)
self.criteria = criteria

def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor:
retval = torch.zeros_like(tokens, dtype=torch.bool)
for criterion in self.criteria:
retval |= criterion(tokens)
return retval

def get_key(self) -> str:
return super().get_key(
) + f"_{'_'.join(criterion.get_key() for criterion in self.criteria)}"
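
A minimal sketch of combining the stop criteria, assuming a Hugging Face tokenizer that exposes `eos_token_id`; the GPT-2 tokenizer and the extra stop-token id are examples only:

```python
import torch
from transformers import AutoTokenizer

from mii.batching.generation.stop_criterion import (
    EosGenerationStopCriterion,
    PipelinedCriterion,
    TokenStopCriterion,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer

# Stop when the last generated token is either the tokenizer's EOS id or an
# explicitly supplied extra stop-token id (13 is used purely for illustration).
criterion = PipelinedCriterion(
    criteria=[
        EosGenerationStopCriterion(tokenizer),
        TokenStopCriterion(token=13, tokenizer=tokenizer),
    ],
    tokenizer=tokenizer,
)

last_tokens = torch.tensor([13, tokenizer.eos_token_id, 42])
print(criterion(last_tokens))  # -> tensor([ True,  True, False])
```
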