Browse Source

melib/line_break: fix panics from Unicode13 linebreak test cases

memfd
Manos Pitsidianakis 1 year ago
parent
commit
10a3430233
Signed by untrusted user: epilys GPG Key ID: 73627C2F690DF710
  1. 157
      melib/src/text_processing/line_break.rs

157
melib/src/text_processing/line_break.rs

@ -53,6 +53,7 @@ pub struct LineBreakCandidateIter<'a> {
reg_ind_streak: u32,
/* Needed for break before and after opportunities */
break_now: bool,
last_break: usize,
}
impl<'a> LineBreakCandidateIter<'a> {
@ -63,6 +64,7 @@ impl<'a> LineBreakCandidateIter<'a> {
iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(),
reg_ind_streak: 0,
break_now: false,
last_break: 0,
}
}
}
@ -118,6 +120,20 @@ macro_rules! next_grapheme_class {
});
}
/// Extension trait used by the line-break rules that look ahead in the text
/// ("... even after spaces").
trait EvenAfterSpaces {
    /// Returns a suffix of `self` obtained by dropping leading characters one
    /// at a time until the remainder is empty or `get_class!` reports `SP`
    /// for it.
    fn even_after_spaces(&self) -> &Self;
}

impl EvenAfterSpaces for str {
    // NOTE(review): characters are dropped while the leading class is *not*
    // `SP`, so the result is either empty or begins at an `SP`-class
    // position. The method name suggests the opposite (skipping leading
    // spaces) -- confirm the intended condition against the callers.
    fn even_after_spaces(&self) -> &Self {
        let mut rest = self;
        loop {
            // Done once nothing is left or the remainder starts with `SP`.
            if rest.is_empty() || get_class!(&rest) == SP {
                return rest;
            }
            // Advance by the UTF-8 width of the character yielded by
            // `get_base_character!`; the `unwrap` panics if it yields `None`.
            let width = get_base_character!(rest).unwrap().len_utf8();
            rest = &rest[width..];
        }
    }
}
/// Returns positions where breaks can happen
/// Examples:
/// ```
@ -134,14 +150,24 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
type Item = (usize, LineBreakCandidate);
fn next(&mut self) -> Option<Self::Item> {
loop {
// Records `$pos` in `$last_break` as the most recent break position.
// If `$last_break` already equals `$pos`, the macro `continue`s the
// enclosing `loop` instead, so the `return` written after an invocation is
// not reached for that position. It must therefore be expanded inside the
// loop.
macro_rules! set_last_break {
($last_break:expr, $pos:expr) => {
if $last_break == $pos {
continue;
}
$last_break = $pos;
};
};
// After end of text, there are no breaks.
if self.pos >= self.text.len() {
if self.pos > self.text.len() {
return None;
}
// LB3 Always break at the end of text
if self.pos + 1 == self.text.len() {
if self.pos == self.text.len() {
let ret = self.pos;
self.pos += 1;
return Some((self.pos, MandatoryBreak));
set_last_break!(self.last_break, ret);
return Some((ret, MandatoryBreak));
}
let LineBreakCandidateIter {
@ -149,6 +175,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
ref text,
ref mut reg_ind_streak,
ref mut break_now,
ref mut last_break,
ref mut pos,
} = self;
let (idx, mut grapheme) = iter.next().unwrap();
@ -156,12 +183,6 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
debug_assert_eq!(idx, *pos);
// LB2 Never break at the start of text
if idx == 0 {
*pos += grapheme.len();
continue;
}
let class = get_class!(grapheme);
if class != RI {
@ -184,12 +205,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
// TODO: LB1
/* Check if next character class allows breaks before it */
let next_char: Option<&(usize, &str)> = iter.peek();
let mut next_char: Option<&(usize, &str)> = iter.peek();
match class {
BK => {
// LB4 Always Break after hard line breaks.
*pos += grapheme.len();
set_last_break!(*last_break, *pos);
return Some((*pos, MandatoryBreak));
}
// LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
@ -197,10 +219,12 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
*pos += grapheme.len();
assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
*pos += grapheme.len();
set_last_break!(*last_break, *pos);
return Some((*pos, MandatoryBreak));
}
CR | LF | NL => {
*pos += grapheme.len();
set_last_break!(*last_break, *pos);
return Some((*pos, MandatoryBreak));
}
_ => {}
@ -219,10 +243,21 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
*pos += grapheme.len();
continue;
}
WJ => {
/*: LB11 Do not break before or after Word joiner and related characters.*/
*pos += grapheme.len();
continue;
}
_ if *break_now => {
*break_now = false;
let ret = *pos;
*pos += grapheme.len();
// LB2 Never break at the start of text
if ret == 0 {
continue;
}
set_last_break!(*last_break, ret);
return Some((ret, BreakAllowed));
}
_ => {}
@ -234,9 +269,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
// spaces intervene
// ZW SP* ÷
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
while next_grapheme_class!((next_char is SP)) {
let (_idx, grapheme) = iter.next().unwrap();
debug_assert_eq!(get_class!(grapheme), SP);
*pos += grapheme.len();
next_char = iter.peek();
}
set_last_break!(*last_break, *pos);
return Some((*pos, MandatoryBreak));
}
ZWJ => {
@ -292,8 +331,9 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
match class {
/* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
SP if !text[idx..].trim_start().is_empty()
&& [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) =>
SP if !text[idx..].even_after_spaces().is_empty()
&& [CL, CP, EX, IS, SY]
.contains(&get_class!(text[idx..].even_after_spaces())) =>
{
*pos += grapheme.len();
while ![CL, CP, EX, IS, SY]
@ -308,22 +348,27 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
/* LB14 Do not break after ‘[’, even after spaces.
* OP SP* ×
*/
while let Some((idx, grapheme)) = self.iter.next() {
*pos = idx + grapheme.len();
if !(get_class!(grapheme) == SP) {
break;
}
*pos += grapheme.len();
while next_grapheme_class!((next_char is SP)) {
let (_idx, grapheme) = iter.next().unwrap();
debug_assert_eq!(get_class!(grapheme), SP);
*pos += grapheme.len();
next_char = iter.peek();
}
continue;
}
QU if get_class!(text[idx + grapheme.len()..].trim_start()) == OP => {
QU if !text[idx + grapheme.len()..].even_after_spaces().is_empty()
&& get_class!(text[idx + grapheme.len()..].even_after_spaces()) == OP =>
{
/* LB15 Do not break within ‘”[’, even with intervening spaces.
* QU SP* × OP */
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
while next_grapheme_class!((next_char is SP)) {
let (_idx, grapheme) = iter.next().unwrap();
debug_assert_eq!(get_class!(grapheme), SP);
*pos += grapheme.len();
next_char = iter.peek();
}
*pos = idx;
continue;
}
QU => {
@ -334,7 +379,10 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
}
continue;
}
CL | CP if get_class!(text[idx + grapheme.len()..].trim_start()) == NS => {
CL | CP
if !text[idx + grapheme.len()..].even_after_spaces().is_empty()
&& get_class!(text[idx + grapheme.len()..].even_after_spaces()) == NS =>
{
/* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
* intervening spaces.
* (CL | CP) SP* × NS */
@ -344,15 +392,18 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
}
continue;
}
B2 if get_class!(text[idx..].trim_start()) == B2 => {
B2 if !text[idx + grapheme.len()..].even_after_spaces().is_empty()
&& get_class!(text[idx + grapheme.len()..].even_after_spaces()) == B2 =>
{
/* LB17 Do not break within ‘——’, even with intervening spaces.
* B2 SP* × B2*/
*pos += grapheme.len();
*break_now = true;
return Some((*pos, BreakAllowed));
continue;
}
SP => {
/* LB18 Break after spaces. SP ÷ */
// Space 0x20 is 1 byte long.
*pos += 1;
*pos += grapheme.len();
set_last_break!(*last_break, *pos);
return Some((*pos, BreakAllowed));
}
_ => {}
@ -372,12 +423,19 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
match class {
CB => {
/* LB20 Break before and after unresolved CB. */
let ret = *pos;
*pos += grapheme.len();
return Some((*pos - 1, BreakAllowed));
*break_now = true;
// LB2 Never break at the start of text
if ret == 0 {
continue;
}
set_last_break!(*last_break, ret);
return Some((ret, BreakAllowed));
}
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
BB => {
BB if !*break_now => {
*pos += grapheme.len();
continue;
}
@ -391,6 +449,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
*pos += grapheme.len();
//*pos += next_grapheme.len();
continue;
}
_ => {}
@ -696,17 +755,49 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
*reg_ind_streak += 1;
*pos += grapheme.len();
if *reg_ind_streak % 2 == 1 {
return Some((*pos - grapheme.len(), BreakAllowed));
let ret = *pos - grapheme.len();
// LB2 Never break at the start of text
if ret == 0 {
continue;
}
set_last_break!(*last_break, ret);
return Some((ret, BreakAllowed));
}
self.iter.next();
continue;
}
_ if next_char.is_none() => {
return None;
CL | CP | IS | SY => {
*pos += grapheme.len();
continue;
}
BK | CR | LF | NL => {
*pos += grapheme.len();
continue;
}
SP | ZW => {
*pos += grapheme.len();
continue;
}
BA | HY | NS => {
*pos += grapheme.len();
continue;
}
_ => {
/* LB31 Break everywhere else.
* ALL ÷
* ÷ ALL
*/
let ret = *pos;
// ALL ÷
*break_now = true;
*pos += grapheme.len();
return Some((*pos - grapheme.len(), BreakAllowed));
// LB2 Never break at the start of text
if ret == 0 {
continue;
}
// ÷ ALL
set_last_break!(*last_break, ret);
return Some((ret, BreakAllowed));
}
}
}

Loading…
Cancel
Save