text_processing: implement Unicode line breaking algorithm
Not conforming to the Unicode standard yet
parent
d84ceca88e
commit
5b679be782
|
@ -8,7 +8,11 @@ edition = "2018"
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "emailparse"
|
name = "emailparse"
|
||||||
path = "src/email_parse.rs"
|
path = "src/email_parse.rs"
|
||||||
|
[[bin]]
|
||||||
|
name = "linebreak"
|
||||||
|
path = "src/linebreak.rs"
|
||||||
|
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
melib = { path = "../melib", version = "*" }
|
melib = { path = "../melib", version = "*" }
|
||||||
|
text_processing = { path = "../text_processing", version = "*" }
|
||||||
|
|
|
@ -0,0 +1,162 @@
|
||||||
|
extern crate melib;
|
||||||
|
use melib::Result;
|
||||||
|
use melib::StackVec;
|
||||||
|
|
||||||
|
extern crate text_processing;
|
||||||
|
use text_processing::line_break::*;
|
||||||
|
|
||||||
|
/// Cost of ending a line at word `j` when the previous line ended at word `i`.
///
/// * `offsets[k]` is the accumulated length of the first `k` words.
/// * `minima[i]` is the best cost known so far for a layout ending at word `i`.
/// * `width` is the target line length.
///
/// A line longer than `width` is charged a flat penalty of `65536` per excess
/// unit; otherwise the squared unused space is added to `minima[i]`.
///
/// All arithmetic on the result saturates at `usize::MAX`, so a `minima[i]`
/// that still holds a "not computed yet" sentinel close to `usize::MAX` yields
/// a huge cost instead of an overflow panic.
///
/// Callers must pass `i < j`: the length computation `j - i - 1` underflows
/// otherwise.
fn cost(i: usize, j: usize, width: usize, minima: &[usize], offsets: &[usize]) -> usize {
    let w = offsets[j] - offsets[i] + j - i - 1;
    if w > width {
        return (w - width).saturating_mul(65536);
    }
    let slack = width - w;
    minima[i].saturating_add(slack.saturating_mul(slack))
}
|
||||||
|
|
||||||
|
fn smawk(
|
||||||
|
rows: &mut StackVec<usize>,
|
||||||
|
columns: &mut StackVec<usize>,
|
||||||
|
minima: &mut Vec<usize>,
|
||||||
|
breaks: &mut Vec<usize>,
|
||||||
|
width: usize,
|
||||||
|
offsets: &Vec<usize>,
|
||||||
|
) {
|
||||||
|
let mut stack = StackVec::new();
|
||||||
|
let mut i = 0;
|
||||||
|
while i < rows.len() {
|
||||||
|
if stack.len() > 0 {
|
||||||
|
let c = columns[stack.len() - 1];
|
||||||
|
if cost(*stack.iter().last().unwrap(), c, width, minima, offsets)
|
||||||
|
< cost(rows[i], c, width, minima, offsets)
|
||||||
|
{
|
||||||
|
if stack.len() < columns.len() {
|
||||||
|
stack.push(rows[i]);
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
} else {
|
||||||
|
stack.pop();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
stack.push(rows[i]);
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let rows = &mut stack;
|
||||||
|
if columns.len() > 1 {
|
||||||
|
let mut odd_columns = columns.iter().skip(1).step_by(2).cloned().collect();
|
||||||
|
smawk(rows, &mut odd_columns, minima, breaks, width, offsets);
|
||||||
|
for (i, o) in odd_columns.into_iter().enumerate() {
|
||||||
|
columns.set(2 * i + 1, o);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut i = 0;
|
||||||
|
let mut j = 0;
|
||||||
|
while j < columns.len() {
|
||||||
|
let end = if j + 1 < columns.len() {
|
||||||
|
breaks[columns[j + 1]]
|
||||||
|
} else {
|
||||||
|
*rows.iter().last().unwrap()
|
||||||
|
};
|
||||||
|
let c = cost(rows[i], columns[j], width, minima, offsets);
|
||||||
|
if c < minima[columns[j]] {
|
||||||
|
minima[columns[j]] = c;
|
||||||
|
breaks[columns[j]] = rows[i];
|
||||||
|
}
|
||||||
|
if rows[i] < end {
|
||||||
|
i += 1;
|
||||||
|
} else {
|
||||||
|
j += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn linear(text: &str, width: usize) -> Vec<String> {
|
||||||
|
let mut words = Vec::new();
|
||||||
|
let breaks = LineBreakCandidateIter::new(text).collect::<Vec<(usize, LineBreakCandidate)>>();
|
||||||
|
{
|
||||||
|
let mut prev = 0;
|
||||||
|
for b in breaks {
|
||||||
|
if &text[prev..b.0] != "\n" {
|
||||||
|
words.push(text[prev..b.0].trim_end_matches("\n"));
|
||||||
|
if text[prev..b.0].ends_with("\n") {
|
||||||
|
words.push(" ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prev = b.0;
|
||||||
|
}
|
||||||
|
if &text[prev..] != "\n" {
|
||||||
|
words.push(text[prev..].trim_end_matches("\n"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let count = words.len();
|
||||||
|
let mut minima = vec![std::usize::MAX - 1; count + 1];
|
||||||
|
minima[0] = 0;
|
||||||
|
let mut offsets = Vec::with_capacity(words.len());
|
||||||
|
offsets.push(0);
|
||||||
|
for w in words.iter() {
|
||||||
|
offsets.push(offsets.iter().last().unwrap() + w.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut breaks = vec![0; count + 1];
|
||||||
|
|
||||||
|
let mut n = count + 1;
|
||||||
|
let mut i = 1;
|
||||||
|
let mut offset = 0;
|
||||||
|
loop {
|
||||||
|
let r = std::cmp::min(n, 2 * i);
|
||||||
|
let edge = i + offset;
|
||||||
|
smawk(
|
||||||
|
&mut (offset..edge).collect(),
|
||||||
|
&mut (edge..(r + offset)).collect(),
|
||||||
|
&mut minima,
|
||||||
|
&mut breaks,
|
||||||
|
width,
|
||||||
|
&offsets,
|
||||||
|
);
|
||||||
|
let x = minima[r - 1 + offset];
|
||||||
|
let mut for_was_broken = false;
|
||||||
|
for j in i..(r - 1) {
|
||||||
|
let y = cost(j + offset, r - 1 + offset, width, &minima, &offsets);
|
||||||
|
if y <= x {
|
||||||
|
n -= j;
|
||||||
|
i = 1;
|
||||||
|
offset += j;
|
||||||
|
for_was_broken = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !for_was_broken || i >= (r - 1) {
|
||||||
|
if r == n {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
i *= 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut lines = Vec::new();
|
||||||
|
let mut j = count;
|
||||||
|
while j > 0 {
|
||||||
|
let mut line = String::new();
|
||||||
|
for i in breaks[j]..j {
|
||||||
|
line.push_str(words[i]);
|
||||||
|
}
|
||||||
|
lines.push(line);
|
||||||
|
j = breaks[j];
|
||||||
|
}
|
||||||
|
lines.reverse();
|
||||||
|
lines
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() -> Result<()> {
|
||||||
|
let text = std::fs::read_to_string(std::env::args().nth(1).unwrap())?;
|
||||||
|
let paragraphs = text.split("\n\n").collect::<Vec<&str>>();
|
||||||
|
for (i, p) in paragraphs.iter().enumerate() {
|
||||||
|
for l in linear(&p, 72) {
|
||||||
|
println!("{}", l.trim());
|
||||||
|
}
|
||||||
|
if i + 1 < paragraphs.len() {
|
||||||
|
println!("");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -4,6 +4,7 @@ version = "0.0.1" #:version
|
||||||
authors = ["Manos Pitsidianakis <el13635@mail.ntua.gr>"]
|
authors = ["Manos Pitsidianakis <el13635@mail.ntua.gr>"]
|
||||||
workspace = ".."
|
workspace = ".."
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
build = "build.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
unicode-segmentation = "1.2.1"
|
unicode-segmentation = "1.2.1"
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
/// Location of the Unicode Character Database line break property table.
///
/// Fetched over HTTPS because its contents are turned into generated source
/// code by this build script.
const LINE_BREAK_TABLE_URL: &str = "https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt";
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::prelude::*;
|
||||||
|
use std::io::BufReader;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
include!("src/types.rs");
|
||||||
|
|
||||||
|
fn main() -> Result<(), std::io::Error> {
|
||||||
|
let mod_path = PathBuf::from("src/tables.rs");
|
||||||
|
if mod_path.exists() {
|
||||||
|
eprintln!(
|
||||||
|
"{} already exists, delete it if you want to replace it.",
|
||||||
|
mod_path.display()
|
||||||
|
);
|
||||||
|
std::process::exit(0);
|
||||||
|
}
|
||||||
|
let mut tmpdir_path = PathBuf::from(
|
||||||
|
std::str::from_utf8(&Command::new("mktemp").arg("-d").output()?.stdout)
|
||||||
|
.unwrap()
|
||||||
|
.trim(),
|
||||||
|
);
|
||||||
|
tmpdir_path.push("LineBreak.txt");
|
||||||
|
Command::new("curl")
|
||||||
|
.args(&["-o", tmpdir_path.to_str().unwrap(), LINE_BREAK_TABLE_URL])
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
let file = File::open(&tmpdir_path)?;
|
||||||
|
let buf_reader = BufReader::new(file);
|
||||||
|
|
||||||
|
let mut line_break_table: Vec<(u32, u32, LineBreakClass)> = Vec::with_capacity(3800);
|
||||||
|
for line in buf_reader.lines() {
|
||||||
|
let line = line.unwrap();
|
||||||
|
if line.starts_with('#') || line.starts_with(' ') || line.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let tokens: &str = line.split_whitespace().next().unwrap();
|
||||||
|
|
||||||
|
let semicolon_idx: usize = tokens.chars().position(|c| c == ';').unwrap();
|
||||||
|
/* LineBreak.txt list is ascii encoded so we can assume each char takes one byte: */
|
||||||
|
let chars_str: &str = &tokens[..semicolon_idx];
|
||||||
|
|
||||||
|
let mut codepoint_iter = chars_str.split("..");
|
||||||
|
|
||||||
|
let first_codepoint: u32 =
|
||||||
|
u32::from_str_radix(std::dbg!(codepoint_iter.next().unwrap()), 16).unwrap();
|
||||||
|
|
||||||
|
let sec_codepoint: u32 = codepoint_iter
|
||||||
|
.next()
|
||||||
|
.map(|v| u32::from_str_radix(std::dbg!(v), 16).unwrap())
|
||||||
|
.unwrap_or(first_codepoint);
|
||||||
|
let class = &tokens[semicolon_idx + 1..semicolon_idx + 1 + 2];
|
||||||
|
line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut file = File::create(&mod_path)?;
|
||||||
|
file.write_all(b"use crate::types::LineBreakClass::*;\n")
|
||||||
|
.unwrap();
|
||||||
|
file.write_all(b"use crate::types::LineBreakClass;\n\n")
|
||||||
|
.unwrap();
|
||||||
|
file.write_all(b"const line_break_rules: &'static [(u32, u32, LineBreakClass)] = &[\n")
|
||||||
|
.unwrap();
|
||||||
|
for l in &line_break_table {
|
||||||
|
file.write_all(format!(" (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes())
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
file.write_all(b"];").unwrap();
|
||||||
|
std::fs::remove_file(&tmpdir_path).unwrap();
|
||||||
|
tmpdir_path.pop();
|
||||||
|
std::fs::remove_dir(&tmpdir_path).unwrap();
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -1,4 +1,8 @@
|
||||||
pub mod grapheme_clusters;
|
pub mod grapheme_clusters;
|
||||||
|
pub mod line_break;
|
||||||
|
mod tables;
|
||||||
|
mod types;
|
||||||
pub mod wcwidth;
|
pub mod wcwidth;
|
||||||
pub use grapheme_clusters::*;
|
pub use grapheme_clusters::*;
|
||||||
|
pub use line_break::*;
|
||||||
pub use wcwidth::*;
|
pub use wcwidth::*;
|
||||||
|
|
|
@ -0,0 +1,703 @@
|
||||||
|
extern crate unicode_segmentation;
|
||||||
|
use self::unicode_segmentation::UnicodeSegmentation;
|
||||||
|
use crate::tables::LINE_BREAK_RULES;
|
||||||
|
use crate::types::LineBreakClass;
|
||||||
|
use core::cmp::Ordering;
|
||||||
|
use core::iter::Peekable;
|
||||||
|
use core::str::FromStr;
|
||||||
|
use LineBreakClass::*;
|
||||||
|
|
||||||
|
/// Kind of break opportunity reported by `LineBreakCandidateIter`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineBreakCandidate {
    /// The line has to be broken at this position (e.g. after a hard line
    /// break or at the end of the text).
    MandatoryBreak,
    /// The line may be broken at this position.
    BreakAllowed,
    // NoBreak, Not used.
}
|
||||||
|
|
||||||
|
use LineBreakCandidate::*;
|
||||||
|
|
||||||
|
pub struct LineBreakCandidateIter<'a> {
|
||||||
|
text: &'a str,
|
||||||
|
iter: Peekable<unicode_segmentation::GraphemeIndices<'a>>,
|
||||||
|
pos: usize,
|
||||||
|
/* Needed for rule LB30a */
|
||||||
|
reg_ind_streak: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> LineBreakCandidateIter<'a> {
|
||||||
|
pub fn new(text: &'a str) -> Self {
|
||||||
|
LineBreakCandidateIter {
|
||||||
|
text,
|
||||||
|
pos: 0,
|
||||||
|
iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(),
|
||||||
|
reg_ind_streak: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extracts the first `char` of a grapheme (any expression that derefs to `str`).
///
/// Evaluates to the `Result` of `char::from_str` applied to the slice that
/// holds exactly the first scalar value. An empty string yields `Err` rather
/// than a panic, which matters for call sites that pass
/// `text[idx..].trim_start()` when only whitespace is left.
///
/// The argument expression is evaluated exactly once. `FromStr` must be in
/// scope at the call site.
macro_rules! get_base_character {
    ($grapheme:expr) => {{
        let grapheme = &$grapheme;
        /* Byte length of the first scalar value; 0 when the string is empty. */
        let base_len = grapheme.chars().next().map_or(0, char::len_utf8);
        char::from_str(&grapheme[..base_len])
    }};
}
|
||||||
|
|
||||||
|
/// Looks up the line breaking class of a grapheme's base character in
/// `LINE_BREAK_RULES`, falling back to `XX` when no base character can be
/// extracted.
///
/// Side effects: none
macro_rules! get_class {
    ($grapheme:expr) => {{
        match get_base_character!($grapheme) {
            Ok(base) => search_table(base as u32, LINE_BREAK_RULES),
            Err(_) => XX,
        }
    }};
}
|
||||||
|
|
||||||
|
/// Helpers for looking at the class of the following grapheme.
///
/// * `next_grapheme_class!(iter, grapheme)` advances `iter`; when it yields a
///   grapheme, that grapheme is stored in `grapheme` and the expression is
///   `Some(class)`, otherwise `None`.
/// * `next_grapheme_class!((next is CLASS))` and
///   `next_grapheme_class!((next is A, B, ...))` take an already peeked
///   `Option` of an `(index, grapheme)` pair and evaluate to `true` when it is
///   `Some` and its grapheme has (one of) the given class(es). Nothing is
///   consumed.
macro_rules! next_grapheme_class {
    ($graph_iter:ident, $grapheme:ident) => ({
        match $graph_iter.next() {
            Some((_, g)) => {
                $grapheme = g;
                Some(get_class!(g))
            }
            None => None,
        }
    });
    (($next_char:ident is $class:expr)) => ({
        match $next_char {
            Some(next) => get_class!(next.1) == $class,
            None => false,
        }
    });
    (($next_char:ident is $($class:ident),+)) => ({
        match $next_char {
            Some(next) => {
                let next_class = get_class!(next.1);
                $(next_class == $class)||+
            }
            None => false,
        }
    });
}
|
||||||
|
|
||||||
|
/// Returns positions where breaks can happen
|
||||||
|
/// Examples:
|
||||||
|
/// ```
|
||||||
|
/// use text_processing::{self, LineBreakCandidate::{self, *}};
|
||||||
|
/// use text_processing::line_break::LineBreakCandidateIter;
|
||||||
|
///
|
||||||
|
/// assert!(LineBreakCandidateIter::new("").collect::<Vec<(usize, LineBreakCandidate)>>().is_empty());
|
||||||
|
/// assert_eq!(&[(7, BreakAllowed), (12, MandatoryBreak)],
|
||||||
|
/// LineBreakCandidateIter::new("Sample Text.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice());
|
||||||
|
/// assert_eq!(&[(3, MandatoryBreak), (7, MandatoryBreak), (10, BreakAllowed), (17, MandatoryBreak)],
|
||||||
|
/// LineBreakCandidateIter::new("Sa\nmp\r\nle T(e)xt.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice());
|
||||||
|
/// ```
|
||||||
|
impl<'a> Iterator for LineBreakCandidateIter<'a> {
|
||||||
|
type Item = (usize, LineBreakCandidate);
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
// After end of text, there are no breaks.
|
||||||
|
if self.pos >= self.text.len() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// LB3 Always break at the end of text
|
||||||
|
if self.pos + 1 == self.text.len() {
|
||||||
|
self.pos += 1;
|
||||||
|
return Some((self.pos, MandatoryBreak));
|
||||||
|
}
|
||||||
|
|
||||||
|
let (idx, mut grapheme) = self.iter.next().unwrap();
|
||||||
|
let LineBreakCandidateIter {
|
||||||
|
ref mut iter,
|
||||||
|
ref text,
|
||||||
|
ref mut reg_ind_streak,
|
||||||
|
ref mut pos,
|
||||||
|
} = self;
|
||||||
|
let iter = iter.by_ref();
|
||||||
|
|
||||||
|
debug_assert_eq!(idx, *pos);
|
||||||
|
|
||||||
|
// LB2 Never break at the start of text
|
||||||
|
if idx == 0 {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
let class = get_class!(grapheme);
|
||||||
|
|
||||||
|
if class != RI {
|
||||||
|
*reg_ind_streak = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
|
||||||
|
* SA, SG, and XX into other line breaking classes depending on criteria outside the scope
|
||||||
|
* of this algorithm.
|
||||||
|
*
|
||||||
|
* In the absence of such criteria all characters with a specific combination of original
|
||||||
|
* class and General_Category property value are resolved as follows:
|
||||||
|
* Resolved Original General_Category
|
||||||
|
* AL AI, SG, XX Any
|
||||||
|
* CM SA Only Mn or Mc
|
||||||
|
* AL SA Any except Mn and Mc
|
||||||
|
* NS SJ Any
|
||||||
|
*/
|
||||||
|
|
||||||
|
// TODO: LB1
|
||||||
|
|
||||||
|
/* Check if next character class allows breaks before it */
|
||||||
|
let next_char: Option<&(usize, &str)> = iter.peek();
|
||||||
|
|
||||||
|
match class {
|
||||||
|
BK => {
|
||||||
|
// LB4 Always Break after hard line breaks.
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return Some((*pos, MandatoryBreak));
|
||||||
|
}
|
||||||
|
// LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
|
||||||
|
CR if next_grapheme_class!((next_char is LF)) => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return Some((*pos, MandatoryBreak));
|
||||||
|
}
|
||||||
|
CR | LF | NL => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return Some((*pos, MandatoryBreak));
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
if let Some((_, next_grapheme)) = next_char {
|
||||||
|
let next_class = get_class!(next_grapheme);
|
||||||
|
match next_class {
|
||||||
|
/* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */
|
||||||
|
BK | CR | LF | NL => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB7 Do not break before spaces or zero width
|
||||||
|
* space. × SP × ZW */
|
||||||
|
SP | ZW => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match class {
|
||||||
|
ZW => {
|
||||||
|
// LB8 Break before any character following a zero-width space, even if one or more
|
||||||
|
// spaces intervene
|
||||||
|
// ZW SP* ÷
|
||||||
|
*pos += grapheme.len();
|
||||||
|
while Some(SP) == next_grapheme_class!(iter, grapheme) {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
}
|
||||||
|
return Some((*pos, MandatoryBreak));
|
||||||
|
}
|
||||||
|
ZWJ => {
|
||||||
|
// LB8a Do not break after a zero width joiner.
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
CM => {
|
||||||
|
// LB9 Do not break a combining character sequence; treat it as if it has the line
|
||||||
|
// breaking class of the base character in all of the following rules. Treat ZWJ as
|
||||||
|
// if it were CM.
|
||||||
|
// Treat X (CM | ZWJ)* as if it were X.
|
||||||
|
// where X is any line break class except BK, CR, LF, NL, SP, or ZW.
|
||||||
|
|
||||||
|
/* Unreachable since we break lines based on graphemes, not characters */
|
||||||
|
unreachable!();
|
||||||
|
}
|
||||||
|
WJ => {
|
||||||
|
/*: LB11 Do not break before or after Word joiner and related characters.*/
|
||||||
|
*pos += grapheme.len();
|
||||||
|
/* Get next grapheme */
|
||||||
|
if next_grapheme_class!(iter, grapheme).is_some() {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
}
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
GL => {
|
||||||
|
/*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
if let Some((next_idx, next_grapheme)) = next_char {
|
||||||
|
let next_class = get_class!(next_grapheme);
|
||||||
|
match next_class {
|
||||||
|
GL if ![SP, BA, HY].contains(&class) => {
|
||||||
|
/* LB12a Do not break before NBSP and related characters, except after spaces and
|
||||||
|
* hyphens. [^SP BA HY] × GL
|
||||||
|
* Also LB12 Do not break after NBSP and related characters */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
|
||||||
|
CL | CP | EX | IS | SY => {
|
||||||
|
*pos = *next_idx;
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match class {
|
||||||
|
/* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
|
||||||
|
SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
|
||||||
|
{
|
||||||
|
*pos += grapheme.len();
|
||||||
|
}
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
OP => {
|
||||||
|
/* LB14 Do not break after ‘[’, even after spaces.
|
||||||
|
* OP SP* ×
|
||||||
|
*/
|
||||||
|
while let Some((idx, grapheme)) = self.iter.next() {
|
||||||
|
*pos = idx + grapheme.len();
|
||||||
|
if !(get_class!(grapheme) == SP) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
QU if get_class!(text[idx..].trim_start()) == OP => {
|
||||||
|
/* LB15 Do not break within ‘”[’, even with intervening spaces.
|
||||||
|
* QU SP* × OP */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
while Some(SP) == next_grapheme_class!(iter, grapheme) {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
}
|
||||||
|
*pos = idx;
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
QU => {
|
||||||
|
/* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
if let Some((_, g)) = self.iter.next() {
|
||||||
|
*pos += g.len();
|
||||||
|
}
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
LineBreakClass::CL | LineBreakClass::CP
|
||||||
|
if get_class!(text[idx..].trim_start()) == NS =>
|
||||||
|
{
|
||||||
|
/* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
|
||||||
|
* intervening spaces.
|
||||||
|
* (CL | CP) SP* × NS */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
while Some(SP) == next_grapheme_class!(iter, grapheme) {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
}
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
B2 if get_class!(text[idx..].trim_start()) == B2 => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
while Some(SP) == next_grapheme_class!(iter, grapheme) {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
}
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
SP => {
|
||||||
|
/* LB18 Break after spaces. SP ÷ */
|
||||||
|
// Space 0x20 is 1 byte long.
|
||||||
|
*pos += 1;
|
||||||
|
return Some((*pos, BreakAllowed));
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
if let Some((next_idx, next_grapheme)) = next_char {
|
||||||
|
let next_class = get_class!(next_grapheme);
|
||||||
|
match next_class {
|
||||||
|
QU if class != SP => {
|
||||||
|
/* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
|
||||||
|
*pos = *next_idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match class {
|
||||||
|
CB => {
|
||||||
|
/* LB20 Break before and after unresolved CB. */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return Some((*pos - 1, BreakAllowed));
|
||||||
|
}
|
||||||
|
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
|
||||||
|
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
|
||||||
|
BB => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some((_, next_grapheme)) = next_char {
|
||||||
|
let next_class = get_class!(next_grapheme);
|
||||||
|
match next_class {
|
||||||
|
BA | HY | NS => {
|
||||||
|
/* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
|
||||||
|
* kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match class {
|
||||||
|
HL if next_grapheme_class!((next_char is HY, BA)) => {
|
||||||
|
/* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */
|
||||||
|
SY if next_grapheme_class!((next_char is HL)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
/* bypass next_char */
|
||||||
|
self.iter.next().unwrap();
|
||||||
|
if let Some((idx, next_grapheme)) = self.iter.next() {
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
}
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB22 Do not break between two ellipses, or between letters, numbers or excla-
|
||||||
|
* mations and ellipsis.
|
||||||
|
* Examples: ‘9...’, ‘a...’, ‘H...’
|
||||||
|
* (AL | HL) × IN */
|
||||||
|
AL | HL if next_grapheme_class!((next_char is IN)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* EX × IN */
|
||||||
|
EX if next_grapheme_class!((next_char is IN)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
EX => {
|
||||||
|
// LB13
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* (ID | EB | EM) × IN */
|
||||||
|
ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* IN × IN */
|
||||||
|
IN if next_grapheme_class!((next_char is IN)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* NU × IN */
|
||||||
|
NU if next_grapheme_class!((next_char is IN)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB23 Do not break between digits and letters.
|
||||||
|
* (AL | HL) × NU */
|
||||||
|
AL | HL if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* NU × (AL | HL) */
|
||||||
|
NU if next_grapheme_class!((next_char is AL, HL)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
|
||||||
|
* and numeric postfixes.
|
||||||
|
* PR × (ID | EB | EM) */
|
||||||
|
PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* (ID | EB | EM) × PO */
|
||||||
|
ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* B24 Do not break between numeric prefix/postfix and letters, or between
|
||||||
|
letters and prefix/postfix.
|
||||||
|
(PR | PO) × (AL | HL)*/
|
||||||
|
PR | PO if next_grapheme_class!((next_char is AL, HL)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/*(AL | HL) × (PR | PO) */
|
||||||
|
AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB25 Do not break between the following pairs of classes relevant to numbers:
|
||||||
|
* CL × PO */
|
||||||
|
CL if next_grapheme_class!((next_char is PO)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* CP × PO */
|
||||||
|
CP if next_grapheme_class!((next_char is PO)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* CL × PR */
|
||||||
|
CL if next_grapheme_class!((next_char is PR)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* CP × PR */
|
||||||
|
CP if next_grapheme_class!((next_char is PR)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* NU × PO */
|
||||||
|
NU if next_grapheme_class!((next_char is PO)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* NU × PR */
|
||||||
|
NU if next_grapheme_class!((next_char is PR)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* PO × OP */
|
||||||
|
PO if next_grapheme_class!((next_char is OP)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* PO × NU */
|
||||||
|
PO if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* PR × OP */
|
||||||
|
PR if next_grapheme_class!((next_char is OP)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* PR × NU */
|
||||||
|
PR if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* HY × NU */
|
||||||
|
HY if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* IS × NU */
|
||||||
|
IS if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* NU × NU */
|
||||||
|
NU if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* SY × NU */
|
||||||
|
SY if next_grapheme_class!((next_char is NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB26 Do not break a Korean syllable.
|
||||||
|
* JL × (JL | JV | H2 | H3) */
|
||||||
|
JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* (JV | H2) × (JV | JT) */
|
||||||
|
JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* (JT | H3) × JT */
|
||||||
|
JT | H3 if next_grapheme_class!((next_char is JT)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB27 Treat a Korean Syllable Block the same as ID.
|
||||||
|
* (JL | JV | JT | H2 | H3) × IN */
|
||||||
|
JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* (JL | JV | JT | H2 | H3) × PO */
|
||||||
|
JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* PR × (JL | JV | JT | H2 | H3) */
|
||||||
|
PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB28 Do not break between alphabetics (“at”).
|
||||||
|
(AL | HL) × (AL | HL) */
|
||||||
|
AL | HL if next_grapheme_class!((next_char is AL, HL)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
|
||||||
|
IS × (AL | HL) */
|
||||||
|
IS if next_grapheme_class!((next_char is AL, HL)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* LB30 Do not break between letters, numbers, or ordinary symbols and opening
|
||||||
|
or closing parentheses.
|
||||||
|
(AL | HL | NU) × OP */
|
||||||
|
AL | HL | NU if next_grapheme_class!((next_char is OP)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/* CP × (AL | HL | NU) */
|
||||||
|
CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
/*LB30b Do not break between an emoji base and an emoji modifier.
|
||||||
|
* EB × EM */
|
||||||
|
EB if next_grapheme_class!((next_char is EM)) => {
|
||||||
|
let (idx, next_grapheme) = next_char.unwrap();
|
||||||
|
*pos = idx + next_grapheme.len();
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
RI => {
|
||||||
|
/* LB30a Break between two regional indicator symbols if and only if there are an
|
||||||
|
* even number of regional indicators preceding the position of the break.
|
||||||
|
* sot (RI RI)* RI × RI
|
||||||
|
* [^RI] (RI RI)* RI × RI */
|
||||||
|
*reg_ind_streak += 1;
|
||||||
|
*pos += grapheme.len();
|
||||||
|
if *reg_ind_streak % 2 == 1 {
|
||||||
|
return Some((*pos - grapheme.len(), BreakAllowed));
|
||||||
|
}
|
||||||
|
self.iter.next();
|
||||||
|
return self.next();
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
*pos += grapheme.len();
|
||||||
|
return Some((*pos - grapheme.len(), BreakAllowed));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass {
|
||||||
|
match t.binary_search_by(|&(lo, hi, _)| {
|
||||||
|
if lo <= c && c <= hi {
|
||||||
|
Ordering::Equal
|
||||||
|
} else if hi < c {
|
||||||
|
Ordering::Less
|
||||||
|
} else {
|
||||||
|
Ordering::Greater
|
||||||
|
}
|
||||||
|
}) {
|
||||||
|
Ok(idx) => t[idx].2,
|
||||||
|
Err(_) => XX,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: collects every break candidate for a short sample text and
    /// prints the segments between consecutive candidate offsets. It makes no
    /// assertions; it only checks that iteration and slicing do not panic.
    #[test]
    fn test_line_breaks() {
        let text = "Fell past it.\n\n‘Well!’ thought Alice to herself.";
        let candidates: Vec<(usize, LineBreakCandidate)> =
            LineBreakCandidateIter::new(text).collect();
        let mut start = 0;
        for (end, _) in candidates {
            println!("{:?}", &text[start..end]);
            start = end;
        }
        // Whatever follows the last candidate offset.
        println!("{:?}", &text[start..]);
    }
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,102 @@
|
||||||
|
/// Line breaking classes of the Unicode Line Breaking Algorithm (UAX #14).
///
/// Variant names are the class abbreviations used by the standard; the
/// per-variant descriptions below give the standard's long names.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum LineBreakClass {
    /// Mandatory Break
    BK,
    /// Combining Mark
    CM,
    /// Carriage Return
    CR,
    /// Non-breaking ("Glue")
    GL,
    /// Line Feed
    LF,
    /// Next Line
    NL,
    /// Space
    SP,
    /// Word Joiner
    WJ,
    /// Zero Width Space
    ZW,
    /// Zero Width Joiner
    ZWJ,
    /// Ambiguous (Alphabetic or Ideographic)
    AI,
    /// Alphabetic
    AL,
    /// Break Opportunity Before and After
    B2,
    /// Break After
    BA,
    /// Break Before
    BB,
    /// Contingent Break Opportunity
    CB,
    /// Conditional Japanese Starter
    CJ,
    /// Close Punctuation
    CL,
    /// Close Parenthesis
    CP,
    /// Emoji Base
    EB,
    /// Emoji Modifier
    EM,
    /// Exclamation/Interrogation
    EX,
    /// Hangul LV Syllable
    H2,
    /// Hangul LVT Syllable
    H3,
    /// Hebrew Letter
    HL,
    /// Hyphen
    HY,
    /// Ideographic
    ID,
    /// Inseparable
    IN,
    /// Infix Numeric Separator
    IS,
    /// Hangul L Jamo
    JL,
    /// Hangul T Jamo
    JT,
    /// Hangul V Jamo
    JV,
    /// Nonstarter
    NS,
    /// Numeric
    NU,
    /// Open Punctuation
    OP,
    /// Postfix Numeric
    PO,
    /// Prefix Numeric
    PR,
    /// Quotation
    QU,
    /// Regional Indicator
    RI,
    /// Complex Context Dependent (South East Asian)
    SA,
    /// Surrogate
    SG,
    /// Symbols Allowing Break After
    SY,
    /// Unknown
    XX,
}
|
||||||
|
|
||||||
|
use LineBreakClass::*;
|
||||||
|
|
||||||
|
impl From<&str> for LineBreakClass {
|
||||||
|
fn from(val: &str) -> Self {
|
||||||
|
match val {
|
||||||
|
stringify!(BK) => BK,
|
||||||
|
stringify!(CM) => CM,
|
||||||
|
stringify!(CR) => CR,
|
||||||
|
stringify!(GL) => GL,
|
||||||
|
stringify!(LF) => LF,
|
||||||
|
stringify!(NL) => NL,
|
||||||
|
stringify!(SP) => SP,
|
||||||
|
stringify!(WJ) => WJ,
|
||||||
|
stringify!(ZW) => ZW,
|
||||||
|
stringify!(ZWJ) => ZWJ,
|
||||||
|
stringify!(AI) => AI,
|
||||||
|
stringify!(AL) => AL,
|
||||||
|
stringify!(B2) => B2,
|
||||||
|
stringify!(BA) => BA,
|
||||||
|
stringify!(BB) => BB,
|
||||||
|
stringify!(CB) => CB,
|
||||||
|
stringify!(CJ) => CJ,
|
||||||
|
stringify!(CL) => CL,
|
||||||
|
stringify!(CP) => CP,
|
||||||
|
stringify!(EB) => EB,
|
||||||
|
|
||||||
|
stringify!(EM) => EM,
|
||||||
|
stringify!(EX) => EX,
|
||||||
|
stringify!(H2) => H2,
|
||||||
|
stringify!(H3) => H3,
|
||||||
|
stringify!(HL) => HL,
|
||||||
|
stringify!(HY) => HY,
|
||||||
|
stringify!(ID) => ID,
|
||||||
|
stringify!(IN) => IN,
|
||||||
|
stringify!(IS) => IS,
|
||||||
|
stringify!(JL) => JL,
|
||||||
|
|
||||||
|
stringify!(JT) => JT,
|
||||||
|
stringify!(JV) => JV,
|
||||||
|
stringify!(NS) => NS,
|
||||||
|
stringify!(NU) => NU,
|
||||||
|
stringify!(OP) => OP,
|
||||||
|
stringify!(PO) => PO,
|
||||||
|
stringify!(PR) => PR,
|
||||||
|
stringify!(QU) => QU,
|
||||||
|
stringify!(RI) => RI,
|
||||||
|
stringify!(SA) => SA,
|
||||||
|
|
||||||
|
stringify!(SG) => SG,
|
||||||
|
stringify!(SY) => SY,
|
||||||
|
stringify!(XX) => XX,
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue