melib/line_break: fix panics from Unicode13 linebreak test cases
parent
83bee279e6
commit
10a3430233
|
@ -53,6 +53,7 @@ pub struct LineBreakCandidateIter<'a> {
|
||||||
reg_ind_streak: u32,
|
reg_ind_streak: u32,
|
||||||
/* Needed for break before and after opportunities */
|
/* Needed for break before and after opportunities */
|
||||||
break_now: bool,
|
break_now: bool,
|
||||||
|
last_break: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> LineBreakCandidateIter<'a> {
|
impl<'a> LineBreakCandidateIter<'a> {
|
||||||
|
@ -63,6 +64,7 @@ impl<'a> LineBreakCandidateIter<'a> {
|
||||||
iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(),
|
iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(),
|
||||||
reg_ind_streak: 0,
|
reg_ind_streak: 0,
|
||||||
break_now: false,
|
break_now: false,
|
||||||
|
last_break: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -118,6 +120,20 @@ macro_rules! next_grapheme_class {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Helper trait for the line-break rules whose wording contains an
/// "even after spaces" / "even with intervening spaces" clause (the LB13,
/// LB15, LB16 and LB17 match guards in `next()` call it on slices of the text).
///
/// In this file it is implemented for `str`.
trait EvenAfterSpaces {
|
||||||
|
/// Returns a borrowed `&Self` derived from the receiver; the exact rule for
/// which part is returned is defined by the implementation.
fn even_after_spaces(&self) -> &Self;
|
||||||
|
}
|
||||||
|
|
||||||
|
// `str` implementation: returns a suffix of `self`, borrowed from the same
// string (no allocation).
impl EvenAfterSpaces for str {
|
||||||
|
// Walks forward from the start of `self`, dropping one base character per
// iteration, and returns the remaining suffix. The result is either empty
// or starts at the first position whose class compares equal to `SP`.
//
// NOTE(review): the method name, and the `trim_start()` calls this helper
// replaces at its call sites, suggest the intent was to skip *leading
// spaces*. As written the loop skips leading *non*-`SP` characters instead
// (`!= SP`) -- confirm which behaviour is wanted.
fn even_after_spaces(&self) -> &Self {
|
||||||
|
let mut ret = self;
|
||||||
|
// Stop at end of input or as soon as the class at the front is `SP`.
// (`get_class!` presumably yields the line-break class of the first
// character of its argument -- TODO confirm against the macro definition.)
while !ret.is_empty() && get_class!(&ret) != SP {
|
||||||
|
// Advance by the UTF-8 byte length of the leading base character so the
// new slice start stays on a char boundary. `unwrap()` relies on
// `get_base_character!` returning `Some` for a non-empty slice --
// assumption, verify against the macro.
ret = &ret[get_base_character!(ret).unwrap().len_utf8()..];
|
||||||
|
}
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns positions where breaks can happen
|
/// Returns positions where breaks can happen
|
||||||
/// Examples:
|
/// Examples:
|
||||||
/// ```
|
/// ```
|
||||||
|
@ -134,14 +150,24 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
type Item = (usize, LineBreakCandidate);
|
type Item = (usize, LineBreakCandidate);
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
loop {
|
loop {
|
||||||
|
// De-duplication guard, invoked immediately before a break position is
// returned from `next()`.
//
// * `$last_break` -- place expression holding the previously recorded break
//   position (it is both compared and assigned to).
// * `$pos`        -- the break position about to be reported.
//
// If `$pos` equals the recorded position, the macro `continue`s the loop
// surrounding the invocation, so the same offset is not yielded again;
// otherwise it records `$pos` as the new last break and falls through to the
// caller's `return`.
macro_rules! set_last_break {
|
||||||
|
($last_break:expr, $pos:expr) => {
|
||||||
|
// Same position as the last recorded break: skip it.
if $last_break == $pos {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// New position: remember it for the next comparison.
$last_break = $pos;
|
||||||
|
};
|
||||||
|
};
|
||||||
// After end of text, there are no breaks.
|
// After end of text, there are no breaks.
|
||||||
if self.pos >= self.text.len() {
|
if self.pos > self.text.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
// LB3 Always break at the end of text
|
// LB3 Always break at the end of text
|
||||||
if self.pos + 1 == self.text.len() {
|
if self.pos == self.text.len() {
|
||||||
|
let ret = self.pos;
|
||||||
self.pos += 1;
|
self.pos += 1;
|
||||||
return Some((self.pos, MandatoryBreak));
|
set_last_break!(self.last_break, ret);
|
||||||
|
return Some((ret, MandatoryBreak));
|
||||||
}
|
}
|
||||||
|
|
||||||
let LineBreakCandidateIter {
|
let LineBreakCandidateIter {
|
||||||
|
@ -149,6 +175,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
ref text,
|
ref text,
|
||||||
ref mut reg_ind_streak,
|
ref mut reg_ind_streak,
|
||||||
ref mut break_now,
|
ref mut break_now,
|
||||||
|
ref mut last_break,
|
||||||
ref mut pos,
|
ref mut pos,
|
||||||
} = self;
|
} = self;
|
||||||
let (idx, mut grapheme) = iter.next().unwrap();
|
let (idx, mut grapheme) = iter.next().unwrap();
|
||||||
|
@ -156,12 +183,6 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
|
|
||||||
debug_assert_eq!(idx, *pos);
|
debug_assert_eq!(idx, *pos);
|
||||||
|
|
||||||
// LB2 Never break at the start of text
|
|
||||||
if idx == 0 {
|
|
||||||
*pos += grapheme.len();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let class = get_class!(grapheme);
|
let class = get_class!(grapheme);
|
||||||
|
|
||||||
if class != RI {
|
if class != RI {
|
||||||
|
@ -184,12 +205,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
// TODO: LB1
|
// TODO: LB1
|
||||||
|
|
||||||
/* Check if next character class allows breaks before it */
|
/* Check if next character class allows breaks before it */
|
||||||
let next_char: Option<&(usize, &str)> = iter.peek();
|
let mut next_char: Option<&(usize, &str)> = iter.peek();
|
||||||
|
|
||||||
match class {
|
match class {
|
||||||
BK => {
|
BK => {
|
||||||
// LB4 Always Break after hard line breaks.
|
// LB4 Always Break after hard line breaks.
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
set_last_break!(*last_break, *pos);
|
||||||
return Some((*pos, MandatoryBreak));
|
return Some((*pos, MandatoryBreak));
|
||||||
}
|
}
|
||||||
// LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
|
// LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
|
||||||
|
@ -197,10 +219,12 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
|
assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
set_last_break!(*last_break, *pos);
|
||||||
return Some((*pos, MandatoryBreak));
|
return Some((*pos, MandatoryBreak));
|
||||||
}
|
}
|
||||||
CR | LF | NL => {
|
CR | LF | NL => {
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
set_last_break!(*last_break, *pos);
|
||||||
return Some((*pos, MandatoryBreak));
|
return Some((*pos, MandatoryBreak));
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
@ -219,10 +243,21 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
WJ => {
|
||||||
|
/* LB11 Do not break before or after Word joiner and related characters. */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
_ if *break_now => {
|
_ if *break_now => {
|
||||||
*break_now = false;
|
*break_now = false;
|
||||||
let ret = *pos;
|
let ret = *pos;
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
// LB2 Never break at the start of text
|
||||||
|
if ret == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
set_last_break!(*last_break, ret);
|
||||||
return Some((ret, BreakAllowed));
|
return Some((ret, BreakAllowed));
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
@ -234,9 +269,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
// spaces intervene
|
// spaces intervene
|
||||||
// ZW SP* ÷
|
// ZW SP* ÷
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
while Some(SP) == next_grapheme_class!(iter, grapheme) {
|
while next_grapheme_class!((next_char is SP)) {
|
||||||
|
let (_idx, grapheme) = iter.next().unwrap();
|
||||||
|
debug_assert_eq!(get_class!(grapheme), SP);
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
next_char = iter.peek();
|
||||||
}
|
}
|
||||||
|
set_last_break!(*last_break, *pos);
|
||||||
return Some((*pos, MandatoryBreak));
|
return Some((*pos, MandatoryBreak));
|
||||||
}
|
}
|
||||||
ZWJ => {
|
ZWJ => {
|
||||||
|
@ -292,8 +331,9 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
|
|
||||||
match class {
|
match class {
|
||||||
/* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
|
/* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
|
||||||
SP if !text[idx..].trim_start().is_empty()
|
SP if !text[idx..].even_after_spaces().is_empty()
|
||||||
&& [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) =>
|
&& [CL, CP, EX, IS, SY]
|
||||||
|
.contains(&get_class!(text[idx..].even_after_spaces())) =>
|
||||||
{
|
{
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
while ![CL, CP, EX, IS, SY]
|
while ![CL, CP, EX, IS, SY]
|
||||||
|
@ -308,22 +348,27 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
/* LB14 Do not break after ‘[’, even after spaces.
|
/* LB14 Do not break after ‘[’, even after spaces.
|
||||||
* OP SP* ×
|
* OP SP* ×
|
||||||
*/
|
*/
|
||||||
while let Some((idx, grapheme)) = self.iter.next() {
|
*pos += grapheme.len();
|
||||||
*pos = idx + grapheme.len();
|
while next_grapheme_class!((next_char is SP)) {
|
||||||
if !(get_class!(grapheme) == SP) {
|
let (_idx, grapheme) = iter.next().unwrap();
|
||||||
break;
|
debug_assert_eq!(get_class!(grapheme), SP);
|
||||||
}
|
*pos += grapheme.len();
|
||||||
|
next_char = iter.peek();
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
QU if get_class!(text[idx + grapheme.len()..].trim_start()) == OP => {
|
QU if !text[idx + grapheme.len()..].even_after_spaces().is_empty()
|
||||||
|
&& get_class!(text[idx + grapheme.len()..].even_after_spaces()) == OP =>
|
||||||
|
{
|
||||||
/* LB15 Do not break within ‘”[’, even with intervening spaces.
|
/* LB15 Do not break within ‘”[’, even with intervening spaces.
|
||||||
* QU SP* × OP */
|
* QU SP* × OP */
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
while Some(SP) == next_grapheme_class!(iter, grapheme) {
|
while next_grapheme_class!((next_char is SP)) {
|
||||||
|
let (_idx, grapheme) = iter.next().unwrap();
|
||||||
|
debug_assert_eq!(get_class!(grapheme), SP);
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
next_char = iter.peek();
|
||||||
}
|
}
|
||||||
*pos = idx;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
QU => {
|
QU => {
|
||||||
|
@ -334,7 +379,10 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
CL | CP if get_class!(text[idx + grapheme.len()..].trim_start()) == NS => {
|
CL | CP
|
||||||
|
if !text[idx + grapheme.len()..].even_after_spaces().is_empty()
|
||||||
|
&& get_class!(text[idx + grapheme.len()..].even_after_spaces()) == NS =>
|
||||||
|
{
|
||||||
/* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
|
/* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
|
||||||
* intervening spaces.
|
* intervening spaces.
|
||||||
* (CL | CP) SP* × NS */
|
* (CL | CP) SP* × NS */
|
||||||
|
@ -344,15 +392,18 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
B2 if get_class!(text[idx..].trim_start()) == B2 => {
|
B2 if !text[idx + grapheme.len()..].even_after_spaces().is_empty()
|
||||||
|
&& get_class!(text[idx + grapheme.len()..].even_after_spaces()) == B2 =>
|
||||||
|
{
|
||||||
|
/* LB17 Do not break within ‘——’, even with intervening spaces.
|
||||||
|
* B2 SP* × B2*/
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
*break_now = true;
|
continue;
|
||||||
return Some((*pos, BreakAllowed));
|
|
||||||
}
|
}
|
||||||
SP => {
|
SP => {
|
||||||
/* LB18 Break after spaces. SP ÷ */
|
/* LB18 Break after spaces. SP ÷ */
|
||||||
// Space 0x20 is 1 byte long.
|
*pos += grapheme.len();
|
||||||
*pos += 1;
|
set_last_break!(*last_break, *pos);
|
||||||
return Some((*pos, BreakAllowed));
|
return Some((*pos, BreakAllowed));
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
@ -372,12 +423,19 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
match class {
|
match class {
|
||||||
CB => {
|
CB => {
|
||||||
/* LB20 Break before and after unresolved CB. */
|
/* LB20 Break before and after unresolved CB. */
|
||||||
|
let ret = *pos;
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
return Some((*pos - 1, BreakAllowed));
|
*break_now = true;
|
||||||
|
// LB2 Never break at the start of text
|
||||||
|
if ret == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
set_last_break!(*last_break, ret);
|
||||||
|
return Some((ret, BreakAllowed));
|
||||||
}
|
}
|
||||||
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
|
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
|
||||||
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
|
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
|
||||||
BB => {
|
BB if !*break_now => {
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -391,6 +449,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
|
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
|
||||||
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
|
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
|
//*pos += next_grapheme.len();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
@ -696,17 +755,49 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
*reg_ind_streak += 1;
|
*reg_ind_streak += 1;
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
if *reg_ind_streak % 2 == 1 {
|
if *reg_ind_streak % 2 == 1 {
|
||||||
return Some((*pos - grapheme.len(), BreakAllowed));
|
let ret = *pos - grapheme.len();
|
||||||
|
// LB2 Never break at the start of text
|
||||||
|
if ret == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
set_last_break!(*last_break, ret);
|
||||||
|
return Some((ret, BreakAllowed));
|
||||||
}
|
}
|
||||||
self.iter.next();
|
self.iter.next();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
_ if next_char.is_none() => {
|
CL | CP | IS | SY => {
|
||||||
return None;
|
*pos += grapheme.len();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
BK | CR | LF | NL => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
SP | ZW => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
BA | HY | NS => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
|
/* LB31 Break everywhere else.
|
||||||
|
* ALL ÷
|
||||||
|
* ÷ ALL
|
||||||
|
*/
|
||||||
|
let ret = *pos;
|
||||||
|
// ALL ÷
|
||||||
|
*break_now = true;
|
||||||
*pos += grapheme.len();
|
*pos += grapheme.len();
|
||||||
return Some((*pos - grapheme.len(), BreakAllowed));
|
// LB2 Never break at the start of text
|
||||||
|
if ret == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// ÷ ALL
|
||||||
|
set_last_break!(*last_break, ret);
|
||||||
|
return Some((ret, BreakAllowed));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue