LineBreakCandidateIter: make iter non-recursive

A line with many graphemes and no break opportunities makes `next()` recurse
deeply enough to overflow the stack, so turn the recursion into a loop.
async
Manos Pitsidianakis 2020-05-31 01:08:22 +03:00
parent 6ceed3cae9
commit b3b9563db0
Signed by: Manos Pitsidianakis
GPG Key ID: 73627C2F690DF710
1 changed file with 567 additions and 564 deletions

View File

@ -130,6 +130,7 @@ macro_rules! next_grapheme_class {
impl<'a> Iterator for LineBreakCandidateIter<'a> {
type Item = (usize, LineBreakCandidate);
fn next(&mut self) -> Option<Self::Item> {
loop {
// After end of text, there are no breaks.
if self.pos >= self.text.len() {
return None;
@ -140,13 +141,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
return Some((self.pos, MandatoryBreak));
}
let (idx, mut grapheme) = self.iter.next().unwrap();
let LineBreakCandidateIter {
ref mut iter,
ref text,
ref mut reg_ind_streak,
ref mut pos,
} = self;
let (idx, mut grapheme) = iter.next().unwrap();
let iter = iter.by_ref();
debug_assert_eq!(idx, *pos);
@ -154,7 +155,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
// LB2 Never break at the start of text
if idx == 0 {
*pos += grapheme.len();
return self.next();
continue;
}
let class = get_class!(grapheme);
@ -206,13 +207,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
/* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */
BK | CR | LF | NL => {
*pos += grapheme.len();
return self.next();
continue;
}
/* LB7 Do not break before spaces or zero width
* space. × SP × ZW */
SP | ZW => {
*pos += grapheme.len();
return self.next();
continue;
}
_ => {}
}
@ -231,7 +232,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
ZWJ => {
// LB8a Do not break after a zero width joiner.
*pos += grapheme.len();
return self.next();
continue;
}
CM => {
@ -242,7 +243,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
// where X is any line break class except BK, CR, LF, NL, SP, or ZW.
*pos += grapheme.len();
return self.next();
continue;
}
WJ => {
/* LB11 Do not break before or after Word joiner and related characters. */
@ -251,12 +252,12 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
if next_grapheme_class!(iter, grapheme).is_some() {
*pos += grapheme.len();
}
return self.next();
continue;
}
GL => {
/* LB12 Non-breaking characters: do not break after NBSP and related characters. */
*pos += grapheme.len();
return self.next();
continue;
}
_ => {}
}
@ -268,12 +269,12 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
* hyphens. [^SP BA HY] × GL
* Also LB12 Do not break after NBSP and related characters */
*pos += grapheme.len();
return self.next();
continue;
}
/* LB13 Do not break before ] or ! or ; or /, even after spaces. */
CL | CP | EX | IS | SY => {
*pos = *next_idx;
return self.next();
continue;
}
_ => {}
}
@ -283,12 +284,13 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
/* LB13 Do not break before ] or ! or ; or /, even after spaces. */
SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
*pos += grapheme.len();
while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
while ![CL, CP, EX, IS, SY]
.contains(&next_grapheme_class!(iter, grapheme).unwrap())
{
*pos += grapheme.len();
}
*pos += grapheme.len();
return self.next();
continue;
}
OP => {
/* LB14 Do not break after [, even after spaces.
@ -300,7 +302,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
break;
}
}
return self.next();
continue;
}
QU if get_class!(text[idx..].trim_start()) == OP => {
/* LB15 Do not break within ‘”[, even with intervening spaces.
@ -310,7 +312,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
*pos += grapheme.len();
}
*pos = idx;
return self.next();
continue;
}
QU => {
/* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
@ -318,7 +320,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
if let Some((_, g)) = self.iter.next() {
*pos += g.len();
}
return self.next();
continue;
}
LineBreakClass::CL | LineBreakClass::CP
if get_class!(text[idx..].trim_start()) == NS =>
@ -330,14 +332,14 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
while Some(SP) == next_grapheme_class!(iter, grapheme) {
*pos += grapheme.len();
}
return self.next();
continue;
}
B2 if get_class!(text[idx..].trim_start()) == B2 => {
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
*pos += grapheme.len();
}
return self.next();
continue;
}
SP => {
/* LB18 Break after spaces. SP ÷ */
@ -354,7 +356,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
/* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
*pos = *next_idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
_ => {}
}
@ -369,7 +371,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
BB => {
*pos += grapheme.len();
return self.next();
continue;
}
_ => {}
}
@ -381,7 +383,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
*pos += grapheme.len();
return self.next();
continue;
}
_ => {}
}
@ -392,7 +394,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB21b Don't break between Solidus and Hebrew letters. SY × HL */
SY if next_grapheme_class!((next_char is HL)) => {
@ -403,7 +405,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
if let Some((idx, next_grapheme)) = self.iter.next() {
*pos = idx + next_grapheme.len();
}
return self.next();
continue;
}
/* LB22 Do not break between two ellipses, or between letters, numbers or
* exclamations and ellipsis.
@ -413,40 +415,40 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* EX × IN */
EX if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
EX => {
// LB13
*pos += grapheme.len();
return self.next();
continue;
}
/* (ID | EB | EM) × IN */
ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* IN × IN */
IN if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* NU × IN */
NU if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB23 Do not break between digits and letters.
* (AL | HL) × NU */
@ -454,14 +456,14 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* NU × (AL | HL) */
NU if next_grapheme_class!((next_char is AL, HL)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
* and numeric postfixes.
@ -470,14 +472,14 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* (ID | EB | EM) × PO */
ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB24 Do not break between numeric prefix/postfix and letters, or between
letters and prefix/postfix.
@ -486,14 +488,14 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/*(AL | HL) × (PR | PO) */
AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB25 Do not break between the following pairs of classes relevant to numbers:
* CL × PO */
@ -501,98 +503,98 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* CP × PO */
CP if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* CL × PR */
CL if next_grapheme_class!((next_char is PR)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* CP × PR */
CP if next_grapheme_class!((next_char is PR)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* NU × PO */
NU if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* NU × PR */
NU if next_grapheme_class!((next_char is PR)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* PO × OP */
PO if next_grapheme_class!((next_char is OP)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* PO × NU */
PO if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* PR × OP */
PR if next_grapheme_class!((next_char is OP)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* PR × NU */
PR if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* HY × NU */
HY if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* IS × NU */
IS if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* NU × NU */
NU if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* SY × NU */
SY if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB26 Do not break a Korean syllable.
* JL × (JL | JV | H2 | H3) */
@ -600,21 +602,21 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* (JV | H2) × (JV | JT) */
JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* (JT | H3) × JT */
JT | H3 if next_grapheme_class!((next_char is JT)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB27 Treat a Korean Syllable Block the same as ID.
* (JL | JV | JT | H2 | H3) × IN */
@ -622,21 +624,21 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* (JL | JV | JT | H2 | H3) × PO */
JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* PR × (JL | JV | JT | H2 | H3) */
PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB28 Do not break between alphabetics (“at”).
(AL | HL) × (AL | HL) */
@ -644,7 +646,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
IS × (AL | HL) */
@ -652,7 +654,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* LB30 Do not break between letters, numbers, or ordinary symbols and opening
or closing parentheses.
@ -661,14 +663,14 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/* CP × (AL | HL | NU) */
CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
/*LB30b Do not break between an emoji base and an emoji modifier.
* EB × EM */
@ -676,7 +678,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
return self.next();
continue;
}
RI => {
/* LB30a Break between two regional indicator symbols if and only if there are an
@ -689,7 +691,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
return Some((*pos - grapheme.len(), BreakAllowed));
}
self.iter.next();
return self.next();
continue;
}
_ if next_char.is_none() => {
return None;
@ -701,6 +703,7 @@ impl<'a> Iterator for LineBreakCandidateIter<'a> {
}
}
}
}
fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass {
match t.binary_search_by(|&(lo, hi, _)| {