From 5327dae02d65a9ce0bc27b986693952dbc87f4a6 Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Tue, 1 Dec 2020 00:42:30 +0200 Subject: [PATCH] melib/text_processing: add LineBreakText iterator A lazy stateful iterator for line breaking text. Useful for very long text where you don't want to linebreak it completely before user requests specific lines. --- melib/src/text_processing/line_break.rs | 469 ++++++++++++++++++++++++ 1 file changed, 469 insertions(+) diff --git a/melib/src/text_processing/line_break.rs b/melib/src/text_processing/line_break.rs index 75cd8fe64..ed052f2b3 100644 --- a/melib/src/text_processing/line_break.rs +++ b/melib/src/text_processing/line_break.rs @@ -28,6 +28,7 @@ use super::types::Reflow; use core::cmp::Ordering; use core::iter::Peekable; use core::str::FromStr; +use std::collections::VecDeque; use LineBreakClass::*; #[derive(Debug, PartialEq, Copy, Clone)] @@ -1327,3 +1328,471 @@ mod segment_tree { } } } + +/// A lazy stateful iterator for line breaking text. Useful for very long text where you don't want +/// to linebreak it completely before user requests specific lines. +#[derive(Debug, Clone)] +pub struct LineBreakText { + text: String, + reflow: Reflow, + paragraph: VecDeque, + paragraph_start_index: usize, + width: Option, + state: ReflowState, +} + +#[derive(Debug, Clone)] +enum ReflowState { + ReflowNo { + cur_index: usize, + }, + ReflowAllWidth { + width: usize, + state: LineBreakTextState, + }, + ReflowAll { + cur_index: usize, + }, + ReflowFormatFlowed { + cur_index: usize, + }, +} + +impl ReflowState { + fn new(reflow: Reflow, width: Option, cur_index: usize) -> ReflowState { + match reflow { + Reflow::All if width.is_some() => ReflowState::ReflowAllWidth { + width: width.unwrap(), + state: LineBreakTextState::AtLine { cur_index }, + }, + Reflow::All => ReflowState::ReflowAll { cur_index }, + Reflow::FormatFlowed => ReflowState::ReflowFormatFlowed { cur_index }, + Reflow::No => ReflowState::ReflowNo { cur_index }, + } + } +} + +#[derive(Debug, Clone)] +enum LineBreakTextState { + AtLine { + cur_index: usize, + }, + WithinLine { + line_index: usize, + line_length: usize, + within_line_index: usize, + breaks: Vec<(usize, LineBreakCandidate)>, + prev_break: usize, + segment_tree: segment_tree::SegmentTree, + }, +} + +impl Default for LineBreakText { + fn default() -> Self { + Self::new(String::new(), Reflow::default(), None) + } +} + +impl LineBreakText { + pub fn new(text: String, reflow: Reflow, width: Option) -> Self { + LineBreakText { + text, + state: ReflowState::new(reflow, width, 0), + paragraph: VecDeque::new(), + paragraph_start_index: 0, + reflow, + width, + } + } + + pub fn width(&self) -> Option { + self.width + } + + pub fn set_reflow(&mut self, new_val: Reflow) -> &mut Self { + self.reflow = new_val; + self.paragraph.clear(); + self.state = ReflowState::new(self.reflow, self.width, self.paragraph_start_index); + self + } + + pub fn set_width(&mut self, new_val: Option) -> &mut Self { + self.width = new_val; + self.paragraph.clear(); + self.state = ReflowState::new(self.reflow, self.width, self.paragraph_start_index); + self + } + + pub fn set_text(&mut self, new_val: String) -> &mut Self { + self.text = new_val; + self.reset() + } + + pub fn reset(&mut self) -> &mut Self { + self.paragraph.clear(); + self.state = ReflowState::new(self.reflow, self.width, 0); + self.paragraph_start_index = 0; + self + } + + pub fn is_finished(&self) -> bool { + match self.state { + ReflowState::ReflowNo { cur_index } + | ReflowState::ReflowAll { cur_index } + | ReflowState::ReflowFormatFlowed { cur_index } + | ReflowState::ReflowAllWidth { + width: _, + state: LineBreakTextState::AtLine { cur_index }, + } => cur_index >= self.text.len(), + ReflowState::ReflowAllWidth { + width: _, + state: LineBreakTextState::WithinLine { .. }, + } => false, + } + } +} + +impl Iterator for LineBreakText { + type Item = String; + fn next(&mut self) -> Option { + if !self.paragraph.is_empty() { + return self.paragraph.pop_front(); + } + if self.is_finished() { + return None; + } + match self.state { + ReflowState::ReflowFormatFlowed { ref mut cur_index } => { + /* rfc3676 - The Text/Plain Format and DelSp Parameters + * https://tools.ietf.org/html/rfc3676 */ + + /* + * - Split lines with indices using str::match_indices() + * - Iterate and reflow flow regions, and pass fixed regions through + */ + self.paragraph_start_index = *cur_index; + let line_indices_iter = self.text[*cur_index..].match_indices('\n').map(|(i, _)| i); + let start_offset = *cur_index; + let mut prev_index = *cur_index; + let mut in_paragraph = false; + let mut paragraph_start = *cur_index; + + let mut prev_quote_depth = 0; + let mut paragraph = VecDeque::new(); + for i in line_indices_iter { + let i = i + start_offset + 1; + let line = &self.text[prev_index..i]; + let mut trimmed = line.trim_start().lines().next().unwrap_or(""); + let mut quote_depth = 0; + let p_str: usize = trimmed + .as_bytes() + .iter() + .position(|&b| { + if b != b'>' { + /* position() is short-circuiting */ + true + } else { + quote_depth += 1; + false + } + }) + .unwrap_or(0); + trimmed = &trimmed[p_str..]; + if trimmed.starts_with(' ') { + /* Remove space stuffing before checking for ending space character. + * [rfc3676#section-4.4] */ + trimmed = &trimmed[1..]; + } + + if trimmed.ends_with(' ') { + if !in_paragraph { + in_paragraph = true; + paragraph_start = prev_index; + } else if prev_quote_depth == quote_depth { + /* This becomes part of the paragraph we're in */ + } else { + /*Malformed line, different quote depths can't be in the same paragraph. */ + let paragraph_s = &self.text[paragraph_start..prev_index]; + reflow_helper2( + &mut paragraph, + paragraph_s, + prev_quote_depth, + in_paragraph, + self.width, + ); + + paragraph_start = prev_index; + } + } else { + if prev_quote_depth == quote_depth || !in_paragraph { + let paragraph_s = &self.text[paragraph_start..i]; + reflow_helper2( + &mut paragraph, + paragraph_s, + quote_depth, + in_paragraph, + self.width, + ); + } else { + /*Malformed line, different quote depths can't be in the same paragraph. */ + let paragraph_s = &self.text[paragraph_start..prev_index]; + reflow_helper2( + &mut paragraph, + paragraph_s, + prev_quote_depth, + in_paragraph, + self.width, + ); + let paragraph_s = &self.text[prev_index..i]; + reflow_helper2( + &mut paragraph, + paragraph_s, + quote_depth, + false, + self.width, + ); + } + *cur_index = i; + std::mem::swap(&mut self.paragraph, &mut paragraph); + paragraph_start = i; + in_paragraph = false; + break; + } + *cur_index = i; + prev_quote_depth = quote_depth; + prev_index = i; + } + if in_paragraph { + let paragraph_s = &self.text[paragraph_start..self.text.len()]; + *cur_index = self.text.len(); + reflow_helper2( + &mut paragraph, + paragraph_s, + prev_quote_depth, + in_paragraph, + self.width, + ); + self.paragraph = paragraph; + } + return self.paragraph.pop_front(); + } + ReflowState::ReflowAllWidth { + width, + ref mut state, + } => { + let width = width.saturating_sub(2); + + loop { + let line: &str; + let cur_index: &mut usize; + let within_line_index: &mut usize; + let prev_break: &mut usize; + let segment_tree: &segment_tree::SegmentTree; + let breaks: &Vec<(usize, LineBreakCandidate)>; + match state { + LineBreakTextState::AtLine { + cur_index: ref mut _cur_index, + } => { + line = if let Some(line) = self + .text + .get(*_cur_index..) + .and_then(|slice| slice.split('\n').next()) + { + line + } else { + *_cur_index = self.text.len(); + return None; + }; + let _cur_index = *_cur_index; + *state = LineBreakTextState::WithinLine { + line_index: _cur_index, + line_length: line.len(), + within_line_index: 0, + breaks: LineBreakCandidateIter::new(line).collect::>( + ), + prev_break: 0, + segment_tree: { + use std::iter::FromIterator; + let mut t: smallvec::SmallVec<[usize; 1024]> = + smallvec::SmallVec::from_iter( + std::iter::repeat(0).take(line.len()), + ); + for (idx, _g) in + UnicodeSegmentation::grapheme_indices(line, true) + { + t[idx] = 1; + } + segment_tree::SegmentTree::new(t) + }, + }; + if let LineBreakTextState::WithinLine { + ref mut line_index, + line_length: _, + within_line_index: ref mut _within_line_index, + breaks: ref _breaks, + prev_break: ref mut _prev_break, + segment_tree: ref _segment_tree, + } = state + { + cur_index = line_index; + within_line_index = _within_line_index; + breaks = _breaks; + prev_break = _prev_break; + + segment_tree = _segment_tree; + } else { + unreachable!() + } + } + LineBreakTextState::WithinLine { + ref mut line_index, + ref line_length, + within_line_index: ref mut _within_line_index, + breaks: ref _breaks, + prev_break: ref mut _prev_break, + segment_tree: ref _segment_tree, + } => { + line = &self.text[*line_index..(*line_index + *line_length)]; + cur_index = line_index; + within_line_index = _within_line_index; + breaks = _breaks; + prev_break = _prev_break; + segment_tree = _segment_tree; + } + } + + if segment_tree.get_sum(0, line.len()) <= width { + *state = LineBreakTextState::AtLine { + cur_index: *cur_index + line.len() + 1, + }; + return Some( + line.trim_end_matches(|c| c == '\r' || c == '\n') + .to_string(), + ); + } + if breaks.len() < 2 { + let mut line = line; + while !line.is_empty() { + let mut chop_index = std::cmp::min(line.len().saturating_sub(1), width); + while chop_index > 0 && !line.is_char_boundary(chop_index) { + chop_index -= 1; + } + if chop_index == 0 { + self.paragraph.push_back(format!("⤷{}", line)); + *cur_index += line.len(); + break; + } else { + self.paragraph + .push_back(format!("⤷{}", &line[..chop_index])); + *cur_index += chop_index; + } + line = &line[chop_index..]; + } + *state = LineBreakTextState::AtLine { + cur_index: *cur_index, + }; + if !self.paragraph.is_empty() { + return self.paragraph.pop_front(); + } + continue; + } + + while *prev_break < breaks.len() { + let new_off = match breaks[*prev_break..].binary_search_by(|(offset, _)| { + segment_tree + .get_sum(*within_line_index, offset.saturating_sub(1)) + .cmp(&width) + }) { + Ok(v) => v, + Err(v) => v, + } + *prev_break; + let end_offset = if new_off >= breaks.len() { + line.len() + } else { + breaks[new_off].0 + }; + if !line[*within_line_index..end_offset].is_empty() { + if *within_line_index == 0 { + let ret = line[*within_line_index..end_offset] + .trim_end_matches(|c| c == '\r' || c == '\n'); + *within_line_index = end_offset; + return Some(ret.to_string()); + } else { + let ret = format!( + "⤷{}", + &line[*within_line_index..end_offset] + .trim_end_matches(|c| c == '\r' || c == '\n') + ); + *within_line_index = end_offset; + return Some(ret); + } + } + if *within_line_index == end_offset && *prev_break == new_off { + break; + } + *within_line_index = end_offset + 1; + *prev_break = new_off; + } + *state = LineBreakTextState::AtLine { + cur_index: *cur_index + line.len() + 1, + }; + } + } + ReflowState::ReflowNo { ref mut cur_index } + | ReflowState::ReflowAll { ref mut cur_index } => { + for line in self.text[*cur_index..].split('\n') { + let ret = line.to_string(); + *cur_index += line.len() + 2; + return Some(ret); + } + return None; + } + } + } +} + +fn reflow_helper2( + ret: &mut VecDeque, + paragraph: &str, + quote_depth: usize, + in_paragraph: bool, + width: Option, +) { + if quote_depth > 0 { + let quotes: String = ">".repeat(quote_depth); + let paragraph = paragraph + .trim_start_matches("es) + .replace(&format!("\n{}", "es), "") + .replace("\n", "") + .replace("\r", ""); + if in_paragraph { + if let Some(width) = width { + ret.extend( + linear(¶graph, width.saturating_sub(quote_depth)) + .into_iter() + .map(|l| format!("{}{}", "es, l)), + ); + } else { + ret.push_back(format!("{}{}", "es, ¶graph)); + } + } else { + ret.push_back(format!("{}{}", "es, ¶graph)); + } + } else { + let paragraph = paragraph.replace("\n", "").replace("\r", ""); + + if in_paragraph { + if let Some(width) = width { + let ex = linear(¶graph, width); + ret.extend(ex.into_iter()); + } else { + ret.push_back(paragraph); + } + } else { + ret.push_back(paragraph); + } + } +}