melib/text_processing: add LineBreakText iterator
A lazy stateful iterator for line-breaking text. Useful for very long text that you don't want to line-break completely before the user requests specific lines.
jmap-eventsource
parent
c990687e5f
commit
5327dae02d
|
@ -28,6 +28,7 @@ use super::types::Reflow;
|
|||
use core::cmp::Ordering;
|
||||
use core::iter::Peekable;
|
||||
use core::str::FromStr;
|
||||
use std::collections::VecDeque;
|
||||
use LineBreakClass::*;
|
||||
|
||||
#[derive(Debug, PartialEq, Copy, Clone)]
|
||||
|
@ -1327,3 +1328,471 @@ mod segment_tree {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A lazy stateful iterator for line breaking text. Useful for very long text where you don't want
|
||||
/// to linebreak it completely before user requests specific lines.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LineBreakText {
|
||||
text: String,
|
||||
reflow: Reflow,
|
||||
paragraph: VecDeque<String>,
|
||||
paragraph_start_index: usize,
|
||||
width: Option<usize>,
|
||||
state: ReflowState,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
enum ReflowState {
|
||||
ReflowNo {
|
||||
cur_index: usize,
|
||||
},
|
||||
ReflowAllWidth {
|
||||
width: usize,
|
||||
state: LineBreakTextState,
|
||||
},
|
||||
ReflowAll {
|
||||
cur_index: usize,
|
||||
},
|
||||
ReflowFormatFlowed {
|
||||
cur_index: usize,
|
||||
},
|
||||
}
|
||||
|
||||
impl ReflowState {
|
||||
fn new(reflow: Reflow, width: Option<usize>, cur_index: usize) -> ReflowState {
|
||||
match reflow {
|
||||
Reflow::All if width.is_some() => ReflowState::ReflowAllWidth {
|
||||
width: width.unwrap(),
|
||||
state: LineBreakTextState::AtLine { cur_index },
|
||||
},
|
||||
Reflow::All => ReflowState::ReflowAll { cur_index },
|
||||
Reflow::FormatFlowed => ReflowState::ReflowFormatFlowed { cur_index },
|
||||
Reflow::No => ReflowState::ReflowNo { cur_index },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
enum LineBreakTextState {
|
||||
AtLine {
|
||||
cur_index: usize,
|
||||
},
|
||||
WithinLine {
|
||||
line_index: usize,
|
||||
line_length: usize,
|
||||
within_line_index: usize,
|
||||
breaks: Vec<(usize, LineBreakCandidate)>,
|
||||
prev_break: usize,
|
||||
segment_tree: segment_tree::SegmentTree,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for LineBreakText {
|
||||
fn default() -> Self {
|
||||
Self::new(String::new(), Reflow::default(), None)
|
||||
}
|
||||
}
|
||||
|
||||
impl LineBreakText {
|
||||
pub fn new(text: String, reflow: Reflow, width: Option<usize>) -> Self {
|
||||
LineBreakText {
|
||||
text,
|
||||
state: ReflowState::new(reflow, width, 0),
|
||||
paragraph: VecDeque::new(),
|
||||
paragraph_start_index: 0,
|
||||
reflow,
|
||||
width,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn width(&self) -> Option<usize> {
|
||||
self.width
|
||||
}
|
||||
|
||||
pub fn set_reflow(&mut self, new_val: Reflow) -> &mut Self {
|
||||
self.reflow = new_val;
|
||||
self.paragraph.clear();
|
||||
self.state = ReflowState::new(self.reflow, self.width, self.paragraph_start_index);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn set_width(&mut self, new_val: Option<usize>) -> &mut Self {
|
||||
self.width = new_val;
|
||||
self.paragraph.clear();
|
||||
self.state = ReflowState::new(self.reflow, self.width, self.paragraph_start_index);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn set_text(&mut self, new_val: String) -> &mut Self {
|
||||
self.text = new_val;
|
||||
self.reset()
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) -> &mut Self {
|
||||
self.paragraph.clear();
|
||||
self.state = ReflowState::new(self.reflow, self.width, 0);
|
||||
self.paragraph_start_index = 0;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn is_finished(&self) -> bool {
|
||||
match self.state {
|
||||
ReflowState::ReflowNo { cur_index }
|
||||
| ReflowState::ReflowAll { cur_index }
|
||||
| ReflowState::ReflowFormatFlowed { cur_index }
|
||||
| ReflowState::ReflowAllWidth {
|
||||
width: _,
|
||||
state: LineBreakTextState::AtLine { cur_index },
|
||||
} => cur_index >= self.text.len(),
|
||||
ReflowState::ReflowAllWidth {
|
||||
width: _,
|
||||
state: LineBreakTextState::WithinLine { .. },
|
||||
} => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for LineBreakText {
|
||||
type Item = String;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if !self.paragraph.is_empty() {
|
||||
return self.paragraph.pop_front();
|
||||
}
|
||||
if self.is_finished() {
|
||||
return None;
|
||||
}
|
||||
match self.state {
|
||||
ReflowState::ReflowFormatFlowed { ref mut cur_index } => {
|
||||
/* rfc3676 - The Text/Plain Format and DelSp Parameters
|
||||
* https://tools.ietf.org/html/rfc3676 */
|
||||
|
||||
/*
|
||||
* - Split lines with indices using str::match_indices()
|
||||
* - Iterate and reflow flow regions, and pass fixed regions through
|
||||
*/
|
||||
self.paragraph_start_index = *cur_index;
|
||||
let line_indices_iter = self.text[*cur_index..].match_indices('\n').map(|(i, _)| i);
|
||||
let start_offset = *cur_index;
|
||||
let mut prev_index = *cur_index;
|
||||
let mut in_paragraph = false;
|
||||
let mut paragraph_start = *cur_index;
|
||||
|
||||
let mut prev_quote_depth = 0;
|
||||
let mut paragraph = VecDeque::new();
|
||||
for i in line_indices_iter {
|
||||
let i = i + start_offset + 1;
|
||||
let line = &self.text[prev_index..i];
|
||||
let mut trimmed = line.trim_start().lines().next().unwrap_or("");
|
||||
let mut quote_depth = 0;
|
||||
let p_str: usize = trimmed
|
||||
.as_bytes()
|
||||
.iter()
|
||||
.position(|&b| {
|
||||
if b != b'>' {
|
||||
/* position() is short-circuiting */
|
||||
true
|
||||
} else {
|
||||
quote_depth += 1;
|
||||
false
|
||||
}
|
||||
})
|
||||
.unwrap_or(0);
|
||||
trimmed = &trimmed[p_str..];
|
||||
if trimmed.starts_with(' ') {
|
||||
/* Remove space stuffing before checking for ending space character.
|
||||
* [rfc3676#section-4.4] */
|
||||
trimmed = &trimmed[1..];
|
||||
}
|
||||
|
||||
if trimmed.ends_with(' ') {
|
||||
if !in_paragraph {
|
||||
in_paragraph = true;
|
||||
paragraph_start = prev_index;
|
||||
} else if prev_quote_depth == quote_depth {
|
||||
/* This becomes part of the paragraph we're in */
|
||||
} else {
|
||||
/*Malformed line, different quote depths can't be in the same paragraph. */
|
||||
let paragraph_s = &self.text[paragraph_start..prev_index];
|
||||
reflow_helper2(
|
||||
&mut paragraph,
|
||||
paragraph_s,
|
||||
prev_quote_depth,
|
||||
in_paragraph,
|
||||
self.width,
|
||||
);
|
||||
|
||||
paragraph_start = prev_index;
|
||||
}
|
||||
} else {
|
||||
if prev_quote_depth == quote_depth || !in_paragraph {
|
||||
let paragraph_s = &self.text[paragraph_start..i];
|
||||
reflow_helper2(
|
||||
&mut paragraph,
|
||||
paragraph_s,
|
||||
quote_depth,
|
||||
in_paragraph,
|
||||
self.width,
|
||||
);
|
||||
} else {
|
||||
/*Malformed line, different quote depths can't be in the same paragraph. */
|
||||
let paragraph_s = &self.text[paragraph_start..prev_index];
|
||||
reflow_helper2(
|
||||
&mut paragraph,
|
||||
paragraph_s,
|
||||
prev_quote_depth,
|
||||
in_paragraph,
|
||||
self.width,
|
||||
);
|
||||
let paragraph_s = &self.text[prev_index..i];
|
||||
reflow_helper2(
|
||||
&mut paragraph,
|
||||
paragraph_s,
|
||||
quote_depth,
|
||||
false,
|
||||
self.width,
|
||||
);
|
||||
}
|
||||
*cur_index = i;
|
||||
std::mem::swap(&mut self.paragraph, &mut paragraph);
|
||||
paragraph_start = i;
|
||||
in_paragraph = false;
|
||||
break;
|
||||
}
|
||||
*cur_index = i;
|
||||
prev_quote_depth = quote_depth;
|
||||
prev_index = i;
|
||||
}
|
||||
if in_paragraph {
|
||||
let paragraph_s = &self.text[paragraph_start..self.text.len()];
|
||||
*cur_index = self.text.len();
|
||||
reflow_helper2(
|
||||
&mut paragraph,
|
||||
paragraph_s,
|
||||
prev_quote_depth,
|
||||
in_paragraph,
|
||||
self.width,
|
||||
);
|
||||
self.paragraph = paragraph;
|
||||
}
|
||||
return self.paragraph.pop_front();
|
||||
}
|
||||
ReflowState::ReflowAllWidth {
|
||||
width,
|
||||
ref mut state,
|
||||
} => {
|
||||
let width = width.saturating_sub(2);
|
||||
|
||||
loop {
|
||||
let line: &str;
|
||||
let cur_index: &mut usize;
|
||||
let within_line_index: &mut usize;
|
||||
let prev_break: &mut usize;
|
||||
let segment_tree: &segment_tree::SegmentTree;
|
||||
let breaks: &Vec<(usize, LineBreakCandidate)>;
|
||||
match state {
|
||||
LineBreakTextState::AtLine {
|
||||
cur_index: ref mut _cur_index,
|
||||
} => {
|
||||
line = if let Some(line) = self
|
||||
.text
|
||||
.get(*_cur_index..)
|
||||
.and_then(|slice| slice.split('\n').next())
|
||||
{
|
||||
line
|
||||
} else {
|
||||
*_cur_index = self.text.len();
|
||||
return None;
|
||||
};
|
||||
let _cur_index = *_cur_index;
|
||||
*state = LineBreakTextState::WithinLine {
|
||||
line_index: _cur_index,
|
||||
line_length: line.len(),
|
||||
within_line_index: 0,
|
||||
breaks: LineBreakCandidateIter::new(line).collect::<Vec<(
|
||||
usize,
|
||||
LineBreakCandidate,
|
||||
)>>(
|
||||
),
|
||||
prev_break: 0,
|
||||
segment_tree: {
|
||||
use std::iter::FromIterator;
|
||||
let mut t: smallvec::SmallVec<[usize; 1024]> =
|
||||
smallvec::SmallVec::from_iter(
|
||||
std::iter::repeat(0).take(line.len()),
|
||||
);
|
||||
for (idx, _g) in
|
||||
UnicodeSegmentation::grapheme_indices(line, true)
|
||||
{
|
||||
t[idx] = 1;
|
||||
}
|
||||
segment_tree::SegmentTree::new(t)
|
||||
},
|
||||
};
|
||||
if let LineBreakTextState::WithinLine {
|
||||
ref mut line_index,
|
||||
line_length: _,
|
||||
within_line_index: ref mut _within_line_index,
|
||||
breaks: ref _breaks,
|
||||
prev_break: ref mut _prev_break,
|
||||
segment_tree: ref _segment_tree,
|
||||
} = state
|
||||
{
|
||||
cur_index = line_index;
|
||||
within_line_index = _within_line_index;
|
||||
breaks = _breaks;
|
||||
prev_break = _prev_break;
|
||||
|
||||
segment_tree = _segment_tree;
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
LineBreakTextState::WithinLine {
|
||||
ref mut line_index,
|
||||
ref line_length,
|
||||
within_line_index: ref mut _within_line_index,
|
||||
breaks: ref _breaks,
|
||||
prev_break: ref mut _prev_break,
|
||||
segment_tree: ref _segment_tree,
|
||||
} => {
|
||||
line = &self.text[*line_index..(*line_index + *line_length)];
|
||||
cur_index = line_index;
|
||||
within_line_index = _within_line_index;
|
||||
breaks = _breaks;
|
||||
prev_break = _prev_break;
|
||||
segment_tree = _segment_tree;
|
||||
}
|
||||
}
|
||||
|
||||
if segment_tree.get_sum(0, line.len()) <= width {
|
||||
*state = LineBreakTextState::AtLine {
|
||||
cur_index: *cur_index + line.len() + 1,
|
||||
};
|
||||
return Some(
|
||||
line.trim_end_matches(|c| c == '\r' || c == '\n')
|
||||
.to_string(),
|
||||
);
|
||||
}
|
||||
if breaks.len() < 2 {
|
||||
let mut line = line;
|
||||
while !line.is_empty() {
|
||||
let mut chop_index = std::cmp::min(line.len().saturating_sub(1), width);
|
||||
while chop_index > 0 && !line.is_char_boundary(chop_index) {
|
||||
chop_index -= 1;
|
||||
}
|
||||
if chop_index == 0 {
|
||||
self.paragraph.push_back(format!("⤷{}", line));
|
||||
*cur_index += line.len();
|
||||
break;
|
||||
} else {
|
||||
self.paragraph
|
||||
.push_back(format!("⤷{}", &line[..chop_index]));
|
||||
*cur_index += chop_index;
|
||||
}
|
||||
line = &line[chop_index..];
|
||||
}
|
||||
*state = LineBreakTextState::AtLine {
|
||||
cur_index: *cur_index,
|
||||
};
|
||||
if !self.paragraph.is_empty() {
|
||||
return self.paragraph.pop_front();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
while *prev_break < breaks.len() {
|
||||
let new_off = match breaks[*prev_break..].binary_search_by(|(offset, _)| {
|
||||
segment_tree
|
||||
.get_sum(*within_line_index, offset.saturating_sub(1))
|
||||
.cmp(&width)
|
||||
}) {
|
||||
Ok(v) => v,
|
||||
Err(v) => v,
|
||||
} + *prev_break;
|
||||
let end_offset = if new_off >= breaks.len() {
|
||||
line.len()
|
||||
} else {
|
||||
breaks[new_off].0
|
||||
};
|
||||
if !line[*within_line_index..end_offset].is_empty() {
|
||||
if *within_line_index == 0 {
|
||||
let ret = line[*within_line_index..end_offset]
|
||||
.trim_end_matches(|c| c == '\r' || c == '\n');
|
||||
*within_line_index = end_offset;
|
||||
return Some(ret.to_string());
|
||||
} else {
|
||||
let ret = format!(
|
||||
"⤷{}",
|
||||
&line[*within_line_index..end_offset]
|
||||
.trim_end_matches(|c| c == '\r' || c == '\n')
|
||||
);
|
||||
*within_line_index = end_offset;
|
||||
return Some(ret);
|
||||
}
|
||||
}
|
||||
if *within_line_index == end_offset && *prev_break == new_off {
|
||||
break;
|
||||
}
|
||||
*within_line_index = end_offset + 1;
|
||||
*prev_break = new_off;
|
||||
}
|
||||
*state = LineBreakTextState::AtLine {
|
||||
cur_index: *cur_index + line.len() + 1,
|
||||
};
|
||||
}
|
||||
}
|
||||
ReflowState::ReflowNo { ref mut cur_index }
|
||||
| ReflowState::ReflowAll { ref mut cur_index } => {
|
||||
for line in self.text[*cur_index..].split('\n') {
|
||||
let ret = line.to_string();
|
||||
*cur_index += line.len() + 2;
|
||||
return Some(ret);
|
||||
}
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn reflow_helper2(
|
||||
ret: &mut VecDeque<String>,
|
||||
paragraph: &str,
|
||||
quote_depth: usize,
|
||||
in_paragraph: bool,
|
||||
width: Option<usize>,
|
||||
) {
|
||||
if quote_depth > 0 {
|
||||
let quotes: String = ">".repeat(quote_depth);
|
||||
let paragraph = paragraph
|
||||
.trim_start_matches("es)
|
||||
.replace(&format!("\n{}", "es), "")
|
||||
.replace("\n", "")
|
||||
.replace("\r", "");
|
||||
if in_paragraph {
|
||||
if let Some(width) = width {
|
||||
ret.extend(
|
||||
linear(¶graph, width.saturating_sub(quote_depth))
|
||||
.into_iter()
|
||||
.map(|l| format!("{}{}", "es, l)),
|
||||
);
|
||||
} else {
|
||||
ret.push_back(format!("{}{}", "es, ¶graph));
|
||||
}
|
||||
} else {
|
||||
ret.push_back(format!("{}{}", "es, ¶graph));
|
||||
}
|
||||
} else {
|
||||
let paragraph = paragraph.replace("\n", "").replace("\r", "");
|
||||
|
||||
if in_paragraph {
|
||||
if let Some(width) = width {
|
||||
let ex = linear(¶graph, width);
|
||||
ret.extend(ex.into_iter());
|
||||
} else {
|
||||
ret.push_back(paragraph);
|
||||
}
|
||||
} else {
|
||||
ret.push_back(paragraph);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue