Skip to content

tokencategorical

TokenCategorical

Bases: Distribution

Source code in hfppl/distributions/tokencategorical.py
class TokenCategorical(Distribution):
    """A Categorical distribution over a language model's vocabulary, whose
    sampled values are `Token` objects rather than raw integer ids."""

    def __init__(self, lm, logits):
        """Create a Categorical distribution whose values are Tokens, not integers.
        Given a language model `lm` and an array of unnormalized log probabilities (of length `len(lm.vocab)`),
        uses softmax to normalize them and samples a Token from the resulting categorical.

        Args:
            lm (hfppl.llms.CachedCausalLM): the language model whose vocabulary is to be generated from.
            logits (np.array): a numpy array of unnormalized log probabilities.

        Raises:
            RuntimeError: if `len(logits)` does not match the tokenizer's vocab size.
        """
        # Fail fast: validate the logits length *before* normalizing, so a size
        # mismatch never runs log_softmax on malformed input (the original
        # checked only after computing self.log_probs).
        if lm.tokenizer.vocab_size != len(logits):
            raise RuntimeError(f"TokenCategorical: vocab size is {lm.tokenizer.vocab_size} but provided {len(logits)} logits.")
        self.lm        = lm
        self.log_probs = log_softmax(logits)

    async def sample(self):
        """Sample a token id from the normalized categorical.

        Returns:
            (Token, float): the sampled Token and its log probability.
        """
        n = np.random.choice(len(self.log_probs), p=(np.exp(self.log_probs)))
        return Token(self.lm, n, self.lm.tokenizer.convert_ids_to_tokens(n)), self.log_probs[n]

    async def log_prob(self, value):
        """Return the log probability of a given Token under this distribution."""
        return self.log_probs[value.token_id]

    async def argmax(self, idx):
        """Return the `idx`-th most probable Token and its log probability.

        Note: `sample` treats `self.log_probs` as a NumPy array (`np.exp`,
        `np.random.choice(p=...)`), so the original `torch.argsort` call would
        raise a TypeError on an ndarray; `np.argsort` is the consistent choice.
        """
        tok = np.argsort(self.log_probs)[-idx]
        return Token(self.lm, tok, self.lm.tokenizer.convert_ids_to_tokens(tok)), self.log_probs[tok]

__init__(lm, logits)

Create a Categorical distribution whose values are Tokens, not integers. Given a language model lm and an array of unnormalized log probabilities (of length len(lm.vocab)), uses softmax to normalize them and samples a Token from the resulting categorical.

Parameters:

| Name     | Type             | Description                                                   | Default  |
| -------- | ---------------- | ------------------------------------------------------------- | -------- |
| `lm`     | `CachedCausalLM` | the language model whose vocabulary is to be generated from.  | required |
| `logits` | `np.array`       | a numpy array of unnormalized log probabilities.              | required |
Source code in hfppl/distributions/tokencategorical.py
def __init__(self, lm, logits):
    """Create a Categorical distribution whose values are Tokens, not integers.
    Given a language model `lm` and an array of unnormalized log probabilities (of length `len(lm.vocab)`),
    uses softmax to normalize them and samples a Token from the resulting categorical.

    Args:
        lm (hfppl.llms.CachedCausalLM): the language model whose vocabulary is to be generated from.
        logits (np.array): a numpy array of unnormalized log probabilities.

    Raises:
        RuntimeError: if `len(logits)` does not match the tokenizer's vocab size.
    """
    # Fail fast: check the logits length against the vocab size *before*
    # normalizing, so a mismatch never runs log_softmax on malformed input.
    if lm.tokenizer.vocab_size != len(logits):
        raise RuntimeError(f"TokenCategorical: vocab size is {lm.tokenizer.vocab_size} but provided {len(logits)} logits.")
    self.lm        = lm
    self.log_probs = log_softmax(logits)