Splitting a string according to a character separator
This problem was suggested by Georges-Axel Jaloyan (Amazon Web Services), during a talk at the first Dafny workshop (London, 2024).
Authors: Jean-Christophe Filliâtre
Topics: Strings
Tools: Why3
see also the index (by topic, by tool, by reference, by year)
Splitting a string according to a character separator
This problem was suggested by Georges-Axel Jaloyan (Amazon Web Services), during a talk at the first Dafny workshop (London, 2024).
The problem is stated as follows: split a string s according to a
character separator sep, as a sequence of substrings [s0;s1;...sn]
where
- the separator does not appear in any si;
- the concatenation s0+sep+s1+sep+...+sep+sn is equal to s.
Examples. Assuming that the separator character is '.', here are some expected input/output:
+---------+-------------------+ | input | output | +---------+-------------------+ | "" | [""] | | "." | ["", ""] | | ".." | ["", "", ""] | | "abc" | ["abc"] | | "abc." | ["abc", ""] | | ".abc" | ["", "abc"] | | "a.bc" | ["a", "bc"] | | "a..bc" | ["a", "", "bc"] | +---------+-------------------+
Note that the output sequence is never empty.
Additionnally, we possibly set a limit to the total number of
substrings in the output. We do so with an integer parameter limit.
If limit = -1, there is no limit. Otherwise, limit >= 1.
When a limit is set, the last substring in the output may contain the
separator.
Examples. Assuming that the separator character is '.' and the limit is 2, here are some expected input/output:
+-----------+----------------+ | input | output | +-----------+----------------+ | "a." | ["a", ""] | | "b.." | ["b", "."] | | "b..." | ["b", ".."] | | "ba.b.b." | ["ba", "b.b."] | +-----------+----------------+
Author: Jean-Christophe Filliâtre (CNRS)
module SplitString use int.Int use seq.Seq use seq.FreeMonoid use seq.Mem
Characters and strings
type char val (=) (x y: char) : bool ensures { result <-> x=y } type string_ = seq char
Specification
let rec function concat (ss: seq string_) (sep: char) : string_ requires { length ss >= 1 } variant { length ss } = if length ss = 1 then ss[0] else concat ss[0..length ss - 1] sep ++ (cons sep ss[length ss - 1])
concat [s0;s1;...;sn] sep is s0+sep+s1+sep+...+sep+sn
for a non-empty sequence of strings
predicate notin (sep: char) (s: string_) = not (mem sep s)
a mere shortcut, for convenience
Code
let ([]) (s: string_) (i: int) : char requires { [@expl:index in string bounds] 0 <= i < length s } = get s i let split_string (s: string_) (sep: char) (limit: int) : (ss: seq string_) requires { limit = -1 || limit >= 1 } ensures { length ss >= 1 } ensures { limit = -1 || length ss <= limit } ensures { forall j. 0 <= j < length ss - 1 -> notin sep ss[j] } ensures { length ss = limit || notin sep ss[length ss - 1] } ensures { concat ss sep == s } = if limit = 1 then return (singleton s); let ref ss = empty in let ref i = 0 in let ref b = 0 in while i < length s do invariant { 0 <= b <= i <= length s } invariant { forall j. 0 <= j < length ss -> notin sep ss[j] } invariant { forall j. b <= j < i -> s[j] <> sep } invariant { limit = -1 || length ss < limit-1 } invariant { concat (snoc ss s[b..]) sep == s } variant { length s - i } if s[i] = sep then ( ss <- snoc ss s[b..i]; assert { s[b..] == s[b..i] ++ cons sep s[i+1..] }; if length ss = limit-1 then return snoc ss s[i+1..]; b <- i+1; ); i <- i+1 done; return snoc ss s[b..] end module SplitStringOCaml use int.Int use seq.Seq use seq.FreeMonoid use seq.Mem use mach.int.Int63 use mach.peano.Peano use mach.peano.Int63 use queue.Queue as Q
Characters and strings
type char val (=) (x y: char) : bool ensures { result <-> x=y } type string_ = private { ghost contents_: seq char; } meta coercion function contents_
Specification
let rec ghost function concat (ss: seq string_) (sep: char) : seq char requires { length ss >= 1 } variant { length ss } = if length ss = 1 then ss[0].contents_ else concat ss[0..length ss - 1] sep ++ (cons sep ss[length ss - 1].contents_)
concat [s0;s1;...;sn] sep is s0+sep+s1+sep+...+sep+sn
for a non-empty sequence of strings
predicate notin (sep: char) (s: string_) = not (mem sep s)
a mere shortcut, for convenience
Code
val length (s: string_) : int63 ensures { result = length s } val sub (s: string_) (i j: int63) : string_ requires { [@expl:index in string bounds] 0 <= i <= j <= length s } ensures { result = s[i..j] } val ([]) (s: string_) (i: int63) : char requires { [@expl:index in string bounds] 0 <= i < length s } ensures { result = s[i] } let partial split_string (s: string_) (sep: char) (limit: int63) : (ss: Q.t string_) requires { limit = -1 || limit >= 1 } ensures { length ss >= 1 } ensures { limit = -1 || length ss <= limit } ensures { forall j. 0 <= j < length ss - 1 -> notin sep ss[j] } ensures { length ss = limit || notin sep ss[length ss - 1] } ensures { concat ss sep == s } = let ref ss = Q.create () in if limit = (1:int63) then (Q.push s ss; return ss); let ref i = 0: int63 in let ref b = 0: int63 in let ghost ref suffix = s in while i < length s do invariant { 0 <= b <= i <= length s } invariant { forall j. 0 <= j < length ss -> notin sep ss[j] } invariant { forall j. b <= j < i -> s[j] <> sep } invariant { limit = -1 || length ss < limit-1 } invariant { suffix = s[b..] } invariant { concat (snoc ss suffix) sep == s } variant { length s - i } if s[i] = sep then ( Q.push (sub s b i) ss; assert { s[b..] == s[b..i] ++ cons sep s[i+1..] }; if to_int63 (Q.length ss) = limit - (1:int63) then (Q.push (sub s (i+1) (length s)) ss; return ss); b <- i+1; suffix <- sub s b (length s) ); i <- i+1 done; Q.push (sub s b (length s)) ss; return ss end
download ZIP archive
Why3 Proof Results for Project "split_string"
Theory "split_string.SplitString": fully verified
| Obligations | Alt-Ergo 2.4.0 | CVC4 1.8 | CVC5 1.0.2 | Z3 4.8.10 | |
| VC for concat | --- | --- | --- | 0.04 | |
| VC for mixfix [] | --- | --- | --- | 0.03 | |
| VC for split_string | --- | --- | --- | --- | |
| split_vc | |||||
| postcondition | --- | --- | --- | 0.02 | |
| postcondition | --- | --- | --- | 0.03 | |
| postcondition | --- | --- | 0.15 | --- | |
| postcondition | --- | --- | --- | 0.02 | |
| postcondition | --- | --- | --- | 0.06 | |
| loop invariant init | --- | --- | --- | 0.05 | |
| loop invariant init | --- | --- | --- | 0.04 | |
| loop invariant init | --- | --- | --- | 0.03 | |
| loop invariant init | --- | --- | --- | 0.05 | |
| loop invariant init | --- | --- | --- | 0.50 | |
| index in string bounds | --- | --- | --- | 0.02 | |
| precondition | --- | --- | --- | 0.02 | |
| assertion | 0.28 | --- | --- | --- | |
| precondition | --- | --- | --- | 0.05 | |
| postcondition | --- | --- | --- | 0.06 | |
| postcondition | --- | --- | --- | 0.04 | |
| postcondition | 0.27 | --- | --- | --- | |
| postcondition | 0.02 | --- | --- | --- | |
| postcondition | --- | 3.80 | --- | --- | |
| loop variant decrease | --- | --- | --- | 0.03 | |
| loop invariant preservation | --- | --- | --- | 0.06 | |
| loop invariant preservation | 0.40 | --- | --- | --- | |
| loop invariant preservation | --- | --- | --- | 0.03 | |
| loop invariant preservation | --- | --- | --- | 0.06 | |
| loop invariant preservation | --- | 3.77 | --- | --- | |
| loop variant decrease | --- | --- | --- | 0.04 | |
| loop invariant preservation | --- | --- | --- | 0.05 | |
| loop invariant preservation | 0.07 | --- | --- | 0.06 | |
| loop invariant preservation | --- | --- | --- | 0.06 | |
| loop invariant preservation | --- | --- | --- | 0.03 | |
| loop invariant preservation | --- | --- | --- | 0.02 | |
| precondition | --- | --- | --- | 0.05 | |
| postcondition | --- | --- | --- | 0.05 | |
| postcondition | --- | --- | --- | 0.05 | |
| postcondition | --- | --- | --- | 0.06 | |
| postcondition | 0.07 | --- | --- | --- | |
| postcondition | --- | --- | --- | 0.05 | |
Theory "split_string.SplitStringOCaml": fully verified
| Obligations | Alt-Ergo 2.4.0 | CVC4 1.8 | CVC5 1.0.2 | Z3 4.8.10 | |||
| VC for concat | --- | --- | --- | 0.08 | |||
| VC for split_string | --- | --- | --- | --- | |||
| split_vc | |||||||
| postcondition | --- | --- | --- | 0.03 | |||
| postcondition | --- | --- | --- | 0.02 | |||
| postcondition | --- | --- | --- | 0.02 | |||
| postcondition | --- | --- | --- | 0.02 | |||
| postcondition | --- | --- | --- | 0.05 | |||
| loop invariant init | --- | --- | --- | 0.05 | |||
| loop invariant init | --- | --- | --- | 0.03 | |||
| loop invariant init | --- | --- | --- | 0.03 | |||
| loop invariant init | --- | --- | --- | 0.06 | |||
| loop invariant init | --- | --- | --- | 0.07 | |||
| loop invariant init | --- | --- | --- | 0.61 | |||
| index in string bounds | --- | --- | --- | 0.07 | |||
| index in string bounds | --- | --- | --- | 0.04 | |||
| assertion | 14.79 | --- | --- | --- | |||
| integer overflow | --- | --- | --- | 0.07 | |||
| integer overflow | --- | --- | --- | 0.07 | |||
| index in string bounds | --- | --- | --- | 0.05 | |||
| postcondition | --- | --- | --- | 0.07 | |||
| postcondition | --- | --- | --- | 0.04 | |||
| postcondition | --- | --- | --- | --- | |||
| assert (j < length ss1.seq) | |||||||
| asserted formula | --- | --- | --- | 0.07 | |||
| postcondition | --- | --- | --- | --- | |||
| case (j = length ss2.seq) | |||||||
| true case (postcondition) | --- | --- | --- | 0.20 | |||
| false case (postcondition) | 0.24 | --- | --- | --- | |||
| postcondition | --- | --- | --- | 0.06 | |||
| postcondition | --- | 5.21 | --- | --- | |||
| integer overflow | --- | --- | --- | 0.06 | |||
| index in string bounds | --- | --- | --- | 0.06 | |||
| integer overflow | --- | --- | --- | 0.07 | |||
| loop variant decrease | --- | --- | --- | 0.05 | |||
| loop invariant preservation | --- | --- | --- | 0.06 | |||
| loop invariant preservation | --- | --- | --- | --- | |||
| split_vc | |||||||
| loop invariant preservation | 2.07 | --- | --- | --- | |||
| loop invariant preservation | --- | --- | 0.19 | --- | |||
| loop invariant preservation | --- | --- | --- | 0.03 | |||
| loop invariant preservation | --- | --- | 0.24 | --- | |||
| loop invariant preservation | --- | --- | --- | --- | |||
| split_vc | |||||||
| loop invariant preservation | --- | --- | --- | --- | |||
| assert (concat (snoc (ss.seq) suffix) sep == concat ss.seq sep ++ cons sep suffix) | |||||||
| asserted formula | --- | --- | --- | 0.21 | |||
| loop invariant preservation | --- | --- | 33.85 | --- | |||
| integer overflow | --- | --- | --- | 0.07 | |||
| loop variant decrease | --- | --- | --- | 0.04 | |||
| loop invariant preservation | --- | --- | --- | 0.05 | |||
| loop invariant preservation | --- | --- | --- | --- | |||
| split_vc | |||||||
| loop invariant preservation | 0.05 | --- | --- | --- | |||
| loop invariant preservation | --- | --- | 0.25 | --- | |||
| loop invariant preservation | --- | --- | --- | 0.06 | |||
| loop invariant preservation | --- | --- | --- | 0.03 | |||
| loop invariant preservation | --- | --- | --- | --- | |||
| split_vc | |||||||
| loop invariant preservation | --- | --- | 0.20 | --- | |||
| index in string bounds | --- | --- | --- | 0.07 | |||
| postcondition | --- | --- | --- | 0.06 | |||
| postcondition | --- | --- | --- | 0.05 | |||
| postcondition | 0.14 | --- | --- | --- | |||
| postcondition | 0.13 | --- | --- | --- | |||
| postcondition | --- | 0.44 | --- | --- | |||