Python bindings for general-sam
and some utilities.
flowchart LR
init((Ξ΅))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of "abcbc" (drawn with Mermaid; the double-bordered nodes bc and abcbc are the accepting states, i.e. the states reached by suffixes).
pip install general-sam
from general_sam import GeneralSam

# Build a general suffix automaton over the raw bytes of "abcbc".
sam = GeneralSam.from_bytes(b"abcbc")

# "cbc" is a suffix of "abcbc"
cursor = sam.get_root_state()
cursor.feed_bytes(b"cbc")
assert cursor.is_accepting()

# "bcb" is not a suffix of "abcbc"
cursor = sam.get_root_state()
cursor.feed_bytes(b"bcb")
assert not cursor.is_accepting()
from general_sam import GeneralSam

# Same idea as the bytes variant, but over unicode characters.
sam = GeneralSam.from_chars("abcbc")
probe = sam.get_root_state()

# "b" is not a suffix but at least a substring of "abcbc"
probe.feed_chars("b")
assert not probe.is_accepting()

# "bc" is a suffix of "abcbc"
probe.feed_chars("c")
assert probe.is_accepting()

# "bcbc" is a suffix of "abcbc"
probe.feed_chars("bc")
assert probe.is_accepting()

# "bcbcbc" is not a substring, much less a suffix of "abcbc"
probe.feed_chars("bc")
assert not probe.is_accepting() and probe.is_nil()
from general_sam import GeneralSam, GeneralSamState, build_trie_from_chars

# Build one automaton from a trie that holds several words at once.
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSam.from_trie(trie)


def fetch_state(s: str) -> GeneralSamState:
    # Walk the automaton from the root along `s` and return where we land.
    walker = sam.get_root_state()
    walker.feed_chars(s)
    return walker


# Suffixes of "hello" / "Chielo" land on accepting states.
assert fetch_state("lo").is_accepting()
assert fetch_state("ello").is_accepting()
assert fetch_state("elo").is_accepting()

# "el" occurs inside a word but is not a suffix: reachable, not accepting.
state = fetch_state("el")
assert not state.is_accepting() and not state.is_nil()

# "bye" does not occur at all: the walk falls off the automaton (nil state).
state = fetch_state("bye")
assert not state.is_accepting() and state.is_nil()
from general_sam import CountInfo, VocabPrefixAutomaton

# The automaton answers, for a query string fed back-to-front via
# `prepend_feed`, which vocab entries have the accumulated string as a prefix.
vocab = ["ζζ²", "θε¬ζζ²", "ζζΎζζ²", "ζθ―", "ζ₯ηζθ―"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")
# NOTE: CountInfo instances are actually related to the sorted `vocab`:
_ = ["ζζΎζζ²", "ζ₯ηζθ―", "ζζ²", "ζθ―", "θε¬ζζ²"]
# Case 1: feed 'δΈθ΅·θε¬ζ' backwards, one segment at a time:
# δΈθ΅· | θ | ε¬ | ζ
state = automaton.get_root_state()
# prepend 'ζ'
cnt_info = automaton.prepend_feed(state, "ζ")
# NOTE(review): `tot_cnt_lower`/`tot_cnt_upper` appear to bound the matching
# slice of the sorted vocab, with `str_cnt` entries in it — confirm in docs.
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)
# found 'ζζ²' at the index 0 and 'ζθ―' at the index 3 prefixed with 'ζ'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"ζζ²", "ζθ―"}
# prepend 'ε¬' on top, so the accumulated query is now 'ε¬ζ'
cnt_info = automaton.prepend_feed(state, "ε¬")
# found nothing prefixed with 'ε¬ζ'
assert cnt_info is None
# ... yet the state is still live, so further prepends may still match
assert not state.is_nil()
# prepend 'θ', querying for 'θε¬ζ'
cnt_info = automaton.prepend_feed(state, "θ")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)
# found 'θε¬ζζ²' at the index 1 prefixed with 'θε¬ζ'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"θε¬ζζ²"}
# prepend 'δΈθ΅·', querying for 'δΈθ΅·θε¬ζ'
assert not state.is_nil()
# found nothing prefixed with 'δΈθ΅·θε¬ζ'
cnt_info = automaton.prepend_feed(state, "δΈθ΅·")
# this time the state goes nil: no entry can ever match from here on
assert state.is_nil()
# Case 2: feed 'ζ₯ζ₯ηζθ―' backwards, one segment at a time:
# ζ₯ | ζ₯η | ζθ―
state = automaton.get_root_state()
# prepend 'ζθ―'
cnt_info = automaton.prepend_feed(state, "ζθ―")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)
# found 'ζθ―' at the index 3 prefixed with 'ζθ―'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"ζθ―"}
# prepend 'ζ₯η', querying for 'ζ₯ηζθ―'
cnt_info = automaton.prepend_feed(state, "ζ₯η")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)
# found 'ζ₯ηζθ―' at the index 4 prefixed with 'ζ₯ηζθ―'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"ζ₯ηζθ―"}
# prepend 'ζ₯', querying for 'ζ₯ζ₯ηζθ―'
assert not state.is_nil()
# found nothing prefixed with 'ζ₯ζ₯ηζθ―'
cnt_info = automaton.prepend_feed(state, "ζ₯")
# no entry can match any longer: the state goes nil
assert state.is_nil()
from general_sam import GeneralSam, GreedyTokenizer, build_trie_from_chars

# Token vocabulary; tokenization is greedy (longest match first).
vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token-id -> trie-node mapping; unmapped nodes stay -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for token_id, node_id in enumerate(token_to_trie_node):
    trie_node_to_token[node_id] = token_id

sam = GeneralSam.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    # Map each (trie node, matched length) pair back to (token id, length).
    matches = tokenizer.tokenize_str(s)
    return [(trie_node_to_token[node], length) for node, length in matches]


assert tokenize("abcde") == [(9, 5)]
assert tokenize("abcdf") == [(1, 2), (8, 2), (7, 1)]
assert tokenize("abca") == [(1, 2), (4, 1), (0, 1)]
- Β© 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- Β© 2023 ModelTC Team
This project is licensed under either of the Apache License, Version 2.0
or the MIT License, at your option.
The SPDX license identifier for this project is `MIT OR Apache-2.0`.