From b3b9563db0414e4a483f158585dac9d3e7b0a91c Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Sun, 31 May 2020 01:08:22 +0300 Subject: [PATCH] LineBreakCandidateIter: make iter non-recursive A line with lots of graphemes without any breaks can overflow the stack, so make the recursion into a loop. --- melib/src/text_processing/line_break.rs | 1131 ++++++++++++----------- 1 file changed, 567 insertions(+), 564 deletions(-) diff --git a/melib/src/text_processing/line_break.rs b/melib/src/text_processing/line_break.rs index 7ba171c10..8e00e47e9 100644 --- a/melib/src/text_processing/line_break.rs +++ b/melib/src/text_processing/line_break.rs @@ -130,573 +130,576 @@ macro_rules! next_grapheme_class { impl<'a> Iterator for LineBreakCandidateIter<'a> { type Item = (usize, LineBreakCandidate); fn next(&mut self) -> Option { - // After end of text, there are no breaks. - if self.pos >= self.text.len() { - return None; - } - // LB3 Always break at the end of text - if self.pos + 1 == self.text.len() { - self.pos += 1; - return Some((self.pos, MandatoryBreak)); - } - - let (idx, mut grapheme) = self.iter.next().unwrap(); - let LineBreakCandidateIter { - ref mut iter, - ref text, - ref mut reg_ind_streak, - ref mut pos, - } = self; - let iter = iter.by_ref(); - - debug_assert_eq!(idx, *pos); - - // LB2 Never break at the start of text - if idx == 0 { - *pos += grapheme.len(); - return self.next(); - } - - let class = get_class!(grapheme); - - if class != RI { - *reg_ind_streak = 0; - } - - /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, - * SA, SG, and XX into other line breaking classes depending on criteria outside the scope - * of this algorithm. - * - * In the absence of such criteria all characters with a specific combination of original - * class and General_Category property value are resolved as follows: - * Resolved Original General_Category - * AL AI, SG, XX Any - * CM SA Only Mn or Mc - * AL SA Any except Mn and Mc - * NS SJ Any - */ - - // TODO: LB1 - - /* Check if next character class allows breaks before it */ - let next_char: Option<&(usize, &str)> = iter.peek(); - - match class { - BK => { - // LB4 Always Break after hard line breaks. - *pos += grapheme.len(); - return Some((*pos, MandatoryBreak)); - } - // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks - CR if next_grapheme_class!((next_char is LF)) => { - *pos += grapheme.len(); - assert!(Some(LF) == next_grapheme_class!(iter, grapheme)); - *pos += grapheme.len(); - return Some((*pos, MandatoryBreak)); - } - CR | LF | NL => { - *pos += grapheme.len(); - return Some((*pos, MandatoryBreak)); - } - _ => {} - } - if let Some((_, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */ - BK | CR | LF | NL => { - *pos += grapheme.len(); - return self.next(); - } - /* LB7 Do not break before spaces or zero width - * space. × SP × ZW */ - SP | ZW => { - *pos += grapheme.len(); - return self.next(); - } - _ => {} - } - } - match class { - ZW => { - // LB8 Break before any character following a zero-width space, even if one or more - // spaces intervene - // ZW SP* ÷ - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { - *pos += grapheme.len(); - } - return Some((*pos, MandatoryBreak)); - } - ZWJ => { - // LB8a Do not break after a zero width joiner. - *pos += grapheme.len(); - return self.next(); - } - - CM => { - // LB9 Do not break a combining character sequence; treat it as if it has the line - // breaking class of the base character in all of the following rules. Treat ZWJ as - // if it were CM. - // Treat X (CM | ZWJ)* as if it were X. - // where X is any line break class except BK, CR, LF, NL, SP, or ZW. - - *pos += grapheme.len(); - return self.next(); - } - WJ => { - /*: LB11 Do not break before or after Word joiner and related characters.*/ - *pos += grapheme.len(); - /* Get next grapheme */ - if next_grapheme_class!(iter, grapheme).is_some() { - *pos += grapheme.len(); - } - return self.next(); - } - GL => { - /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/ - *pos += grapheme.len(); - return self.next(); - } - _ => {} - } - if let Some((next_idx, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - GL if ![SP, BA, HY].contains(&class) => { - /* LB12a Do not break before NBSP and related characters, except after spaces and - * hyphens. [^SP BA HY] × GL - * Also LB12 Do not break after NBSP and related characters */ - *pos += grapheme.len(); - return self.next(); - } - /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ - CL | CP | EX | IS | SY => { - *pos = *next_idx; - return self.next(); - } - _ => {} - } - } - - match class { - /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ - SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => { - *pos += grapheme.len(); - while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap()) - { - *pos += grapheme.len(); - } - *pos += grapheme.len(); - return self.next(); - } - OP => { - /* LB14 Do not break after ‘[’, even after spaces. - * OP SP* × - */ - while let Some((idx, grapheme)) = self.iter.next() { - *pos = idx + grapheme.len(); - if !(get_class!(grapheme) == SP) { - break; - } - } - return self.next(); - } - QU if get_class!(text[idx..].trim_start()) == OP => { - /* LB15 Do not break within ‘”[’, even with intervening spaces. - * QU SP* × OP */ - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { - *pos += grapheme.len(); - } - *pos = idx; - return self.next(); - } - QU => { - /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ - *pos += grapheme.len(); - if let Some((_, g)) = self.iter.next() { - *pos += g.len(); - } - return self.next(); - } - LineBreakClass::CL | LineBreakClass::CP - if get_class!(text[idx..].trim_start()) == NS => - { - /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with - * intervening spaces. - * (CL | CP) SP* × NS */ - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { - *pos += grapheme.len(); - } - return self.next(); - } - B2 if get_class!(text[idx..].trim_start()) == B2 => { - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { - *pos += grapheme.len(); - } - return self.next(); - } - SP => { - /* LB18 Break after spaces. SP ÷ */ - // Space 0x20 is 1 byte long. - *pos += 1; - return Some((*pos, BreakAllowed)); - } - _ => {} - } - if let Some((next_idx, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - QU if class != SP => { - /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ - *pos = *next_idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - _ => {} - } - } - match class { - CB => { - /* LB20 Break before and after unresolved CB. */ - *pos += grapheme.len(); - return Some((*pos - 1, BreakAllowed)); - } - /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small - * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ - BB => { - *pos += grapheme.len(); - return self.next(); - } - _ => {} - } - - if let Some((_, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - BA | HY | NS => { - /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small - * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ - *pos += grapheme.len(); - return self.next(); - } - _ => {} - } - } - match class { - HL if next_grapheme_class!((next_char is HY, BA)) => { - /* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */ - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */ - SY if next_grapheme_class!((next_char is HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - /* bypass next_char */ - self.iter.next().unwrap(); - if let Some((idx, next_grapheme)) = self.iter.next() { - *pos = idx + next_grapheme.len(); - } - return self.next(); - } - /* LB22 Do not break between two ellipses, or between letters, numbers or excla- - * mations and ellipsis. - * Examples: ‘9...’, ‘a...’, ‘H...’ - * (AL | HL) × IN */ - AL | HL if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* EX × IN */ - EX if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - EX => { - // LB13 - *pos += grapheme.len(); - return self.next(); - } - /* (ID | EB | EM) × IN */ - ID | EB | EM if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* IN × IN */ - IN if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × IN */ - NU if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB23 Do not break between digits and letters. - * (AL | HL) × NU */ - AL | HL if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × (AL | HL) */ - NU if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs - * and numeric postfixes. - * PR × (ID | EB | EM) */ - PR if next_grapheme_class!((next_char is ID, EB, EM)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (ID | EB | EM) × PO */ - ID | EB | EM if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* B24 Do not break between numeric prefix/postfix and letters, or between - letters and prefix/postfix. - (PR | PO) × (AL | HL)*/ - PR | PO if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /*(AL | HL) × (PR | PO) */ - AL | HL if next_grapheme_class!((next_char is PR, PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB25 Do not break between the following pairs of classes relevant to numbers: - * CL × PO */ - CL if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CP × PO */ - CP if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CL × PR */ - CL if next_grapheme_class!((next_char is PR)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CP × PR */ - CP if next_grapheme_class!((next_char is PR)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × PO */ - NU if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × PR */ - NU if next_grapheme_class!((next_char is PR)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PO × OP */ - PO if next_grapheme_class!((next_char is OP)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PO × NU */ - PO if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PR × OP */ - PR if next_grapheme_class!((next_char is OP)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PR × NU */ - PR if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* HY × NU */ - HY if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* IS × NU */ - IS if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × NU */ - NU if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* SY × NU */ - SY if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB26 Do not break a Korean syllable. - * JL × (JL | JV | H2 | H3) */ - JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (JV | H2) × (JV | JT) */ - JV | H2 if next_grapheme_class!((next_char is JV, JT)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (JT | H3) × JT */ - JT | H3 if next_grapheme_class!((next_char is JT)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB27 Treat a Korean Syllable Block the same as ID. - * (JL | JV | JT | H2 | H3) × IN */ - JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (JL | JV | JT | H2 | H3) × PO */ - JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PR × (JL | JV | JT | H2 | H3) */ - PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB28 Do not break between alphabetics (“at”). - (AL | HL) × (AL | HL) */ - AL | HL if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). - IS × (AL | HL) */ - IS if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB30 Do not break between letters, numbers, or ordinary symbols and opening - or closing parentheses. - (AL | HL | NU) × OP */ - AL | HL | NU if next_grapheme_class!((next_char is OP)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CP × (AL | HL | NU) */ - CP if next_grapheme_class!((next_char is AL, HL , NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /*LB30b Do not break between an emoji base and an emoji modifier. - * EB × EM */ - EB if next_grapheme_class!((next_char is EM)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - RI => { - /* LB30a Break between two regional indicator symbols if and only if there are an - * even number of regional indicators preceding the position of the break. - * sot (RI RI)* RI × RI - * [^RI] (RI RI)* RI × RI */ - *reg_ind_streak += 1; - *pos += grapheme.len(); - if *reg_ind_streak % 2 == 1 { - return Some((*pos - grapheme.len(), BreakAllowed)); - } - self.iter.next(); - return self.next(); - } - _ if next_char.is_none() => { + loop { + // After end of text, there are no breaks. + if self.pos >= self.text.len() { return None; } - _ => { + // LB3 Always break at the end of text + if self.pos + 1 == self.text.len() { + self.pos += 1; + return Some((self.pos, MandatoryBreak)); + } + + let LineBreakCandidateIter { + ref mut iter, + ref text, + ref mut reg_ind_streak, + ref mut pos, + } = self; + let (idx, mut grapheme) = iter.next().unwrap(); + let iter = iter.by_ref(); + + debug_assert_eq!(idx, *pos); + + // LB2 Never break at the start of text + if idx == 0 { *pos += grapheme.len(); - return Some((*pos - grapheme.len(), BreakAllowed)); + continue; + } + + let class = get_class!(grapheme); + + if class != RI { + *reg_ind_streak = 0; + } + + /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, + * SA, SG, and XX into other line breaking classes depending on criteria outside the scope + * of this algorithm. + * + * In the absence of such criteria all characters with a specific combination of original + * class and General_Category property value are resolved as follows: + * Resolved Original General_Category + * AL AI, SG, XX Any + * CM SA Only Mn or Mc + * AL SA Any except Mn and Mc + * NS SJ Any + */ + + // TODO: LB1 + + /* Check if next character class allows breaks before it */ + let next_char: Option<&(usize, &str)> = iter.peek(); + + match class { + BK => { + // LB4 Always Break after hard line breaks. + *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); + } + // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks + CR if next_grapheme_class!((next_char is LF)) => { + *pos += grapheme.len(); + assert!(Some(LF) == next_grapheme_class!(iter, grapheme)); + *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); + } + CR | LF | NL => { + *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); + } + _ => {} + } + if let Some((_, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */ + BK | CR | LF | NL => { + *pos += grapheme.len(); + continue; + } + /* LB7 Do not break before spaces or zero width + * space. × SP × ZW */ + SP | ZW => { + *pos += grapheme.len(); + continue; + } + _ => {} + } + } + match class { + ZW => { + // LB8 Break before any character following a zero-width space, even if one or more + // spaces intervene + // ZW SP* ÷ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + return Some((*pos, MandatoryBreak)); + } + ZWJ => { + // LB8a Do not break after a zero width joiner. + *pos += grapheme.len(); + continue; + } + + CM => { + // LB9 Do not break a combining character sequence; treat it as if it has the line + // breaking class of the base character in all of the following rules. Treat ZWJ as + // if it were CM. + // Treat X (CM | ZWJ)* as if it were X. + // where X is any line break class except BK, CR, LF, NL, SP, or ZW. + + *pos += grapheme.len(); + continue; + } + WJ => { + /*: LB11 Do not break before or after Word joiner and related characters.*/ + *pos += grapheme.len(); + /* Get next grapheme */ + if next_grapheme_class!(iter, grapheme).is_some() { + *pos += grapheme.len(); + } + continue; + } + GL => { + /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/ + *pos += grapheme.len(); + continue; + } + _ => {} + } + if let Some((next_idx, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + GL if ![SP, BA, HY].contains(&class) => { + /* LB12a Do not break before NBSP and related characters, except after spaces and + * hyphens. [^SP BA HY] × GL + * Also LB12 Do not break after NBSP and related characters */ + *pos += grapheme.len(); + continue; + } + /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ + CL | CP | EX | IS | SY => { + *pos = *next_idx; + continue; + } + _ => {} + } + } + + match class { + /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ + SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => { + *pos += grapheme.len(); + while ![CL, CP, EX, IS, SY] + .contains(&next_grapheme_class!(iter, grapheme).unwrap()) + { + *pos += grapheme.len(); + } + *pos += grapheme.len(); + continue; + } + OP => { + /* LB14 Do not break after ‘[’, even after spaces. + * OP SP* × + */ + while let Some((idx, grapheme)) = self.iter.next() { + *pos = idx + grapheme.len(); + if !(get_class!(grapheme) == SP) { + break; + } + } + continue; + } + QU if get_class!(text[idx..].trim_start()) == OP => { + /* LB15 Do not break within ‘”[’, even with intervening spaces. + * QU SP* × OP */ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + *pos = idx; + continue; + } + QU => { + /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ + *pos += grapheme.len(); + if let Some((_, g)) = self.iter.next() { + *pos += g.len(); + } + continue; + } + LineBreakClass::CL | LineBreakClass::CP + if get_class!(text[idx..].trim_start()) == NS => + { + /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with + * intervening spaces. + * (CL | CP) SP* × NS */ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + continue; + } + B2 if get_class!(text[idx..].trim_start()) == B2 => { + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + continue; + } + SP => { + /* LB18 Break after spaces. SP ÷ */ + // Space 0x20 is 1 byte long. + *pos += 1; + return Some((*pos, BreakAllowed)); + } + _ => {} + } + if let Some((next_idx, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + QU if class != SP => { + /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ + *pos = *next_idx + next_grapheme.len(); + self.iter.next(); + continue; + } + _ => {} + } + } + match class { + CB => { + /* LB20 Break before and after unresolved CB. */ + *pos += grapheme.len(); + return Some((*pos - 1, BreakAllowed)); + } + /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small + * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + BB => { + *pos += grapheme.len(); + continue; + } + _ => {} + } + + if let Some((_, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + BA | HY | NS => { + /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small + * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + *pos += grapheme.len(); + continue; + } + _ => {} + } + } + match class { + HL if next_grapheme_class!((next_char is HY, BA)) => { + /* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */ + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */ + SY if next_grapheme_class!((next_char is HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + /* bypass next_char */ + self.iter.next().unwrap(); + if let Some((idx, next_grapheme)) = self.iter.next() { + *pos = idx + next_grapheme.len(); + } + continue; + } + /* LB22 Do not break between two ellipses, or between letters, numbers or excla- + * mations and ellipsis. + * Examples: ‘9...’, ‘a...’, ‘H...’ + * (AL | HL) × IN */ + AL | HL if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* EX × IN */ + EX if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + EX => { + // LB13 + *pos += grapheme.len(); + continue; + } + /* (ID | EB | EM) × IN */ + ID | EB | EM if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* IN × IN */ + IN if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × IN */ + NU if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB23 Do not break between digits and letters. + * (AL | HL) × NU */ + AL | HL if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × (AL | HL) */ + NU if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs + * and numeric postfixes. + * PR × (ID | EB | EM) */ + PR if next_grapheme_class!((next_char is ID, EB, EM)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* (ID | EB | EM) × PO */ + ID | EB | EM if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* B24 Do not break between numeric prefix/postfix and letters, or between + letters and prefix/postfix. + (PR | PO) × (AL | HL)*/ + PR | PO if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /*(AL | HL) × (PR | PO) */ + AL | HL if next_grapheme_class!((next_char is PR, PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB25 Do not break between the following pairs of classes relevant to numbers: + * CL × PO */ + CL if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* CP × PO */ + CP if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* CL × PR */ + CL if next_grapheme_class!((next_char is PR)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* CP × PR */ + CP if next_grapheme_class!((next_char is PR)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × PO */ + NU if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × PR */ + NU if next_grapheme_class!((next_char is PR)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* PO × OP */ + PO if next_grapheme_class!((next_char is OP)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* PO × NU */ + PO if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* PR × OP */ + PR if next_grapheme_class!((next_char is OP)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* PR × NU */ + PR if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* HY × NU */ + HY if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* IS × NU */ + IS if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × NU */ + NU if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* SY × NU */ + SY if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB26 Do not break a Korean syllable. + * JL × (JL | JV | H2 | H3) */ + JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* (JV | H2) × (JV | JT) */ + JV | H2 if next_grapheme_class!((next_char is JV, JT)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* (JT | H3) × JT */ + JT | H3 if next_grapheme_class!((next_char is JT)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB27 Treat a Korean Syllable Block the same as ID. + * (JL | JV | JT | H2 | H3) × IN */ + JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* (JL | JV | JT | H2 | H3) × PO */ + JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* PR × (JL | JV | JT | H2 | H3) */ + PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB28 Do not break between alphabetics (“at”). + (AL | HL) × (AL | HL) */ + AL | HL if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). + IS × (AL | HL) */ + IS if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB30 Do not break between letters, numbers, or ordinary symbols and opening + or closing parentheses. + (AL | HL | NU) × OP */ + AL | HL | NU if next_grapheme_class!((next_char is OP)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* CP × (AL | HL | NU) */ + CP if next_grapheme_class!((next_char is AL, HL , NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /*LB30b Do not break between an emoji base and an emoji modifier. + * EB × EM */ + EB if next_grapheme_class!((next_char is EM)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + RI => { + /* LB30a Break between two regional indicator symbols if and only if there are an + * even number of regional indicators preceding the position of the break. + * sot (RI RI)* RI × RI + * [^RI] (RI RI)* RI × RI */ + *reg_ind_streak += 1; + *pos += grapheme.len(); + if *reg_ind_streak % 2 == 1 { + return Some((*pos - grapheme.len(), BreakAllowed)); + } + self.iter.next(); + continue; + } + _ if next_char.is_none() => { + return None; + } + _ => { + *pos += grapheme.len(); + return Some((*pos - grapheme.len(), BreakAllowed)); + } } } }