Last active
February 28, 2020 07:39
-
-
Save light0x00/0dc5bdc4779a87f8b1d83e74ef60566c to your computer and use it in GitHub Desktop.
《编译原理》 带有哨兵标记的forward指针移动算法
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package practice1; | |
/** | |
* <p> | |
* | |
* </p> | |
* | |
* @author light | |
* @since 2019/10/26 | |
*/ | |
public interface IPushbackReader { | |
/** | |
* 预读下一个待读字符,而不触发forward指针移动 | |
* | |
* @return | |
*/ | |
int peek(); | |
/** | |
* 读取下一个待读字符(n),并移动指针到「该待读字符的后一个(n+1)」处 | |
* | |
* @return | |
*/ | |
int read(); | |
/** | |
* 回退一个字符,需注意调用一次和多次作用是一样的。如需大范围回退,请使用mark/reset | |
*/ | |
void retract(); | |
/** | |
* 标记下一个要读的字符(当前forward指向的字符),该标记可配合reset()回退forward指针 | |
* | |
* @return | |
*/ | |
Marker mark(); | |
/** | |
* forward指针回退到marker | |
* | |
* @param marker 回退点 | |
*/ | |
void reset(Marker marker); | |
/** | |
* 标记已读过的字符可丢弃,这相当于释放可缓存的空闲空间. 具体来讲,begin移至后一个要读的字符(forward)所在位置 | |
*/ | |
void flip(); | |
void close(); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package practice1; | |
import common.exp.Assert; | |
import common.exp.EnvironmentError; | |
import common.exp.ParseException; | |
import lombok.extern.slf4j.Slf4j; | |
import java.io.IOException; | |
import java.io.Reader; | |
/** | |
* <p> | |
* 《编译原理》P73「带有哨兵标记的forward指针移动算法」实现优化版。 | |
* <p> | |
* 按照推演,哨兵数量可能是1~3个 | |
* 因此,缓冲最大可容纳 BUFFER_SIZE-1 ~ BUFFER_SIZE-3 个字符. 达到该上限之前 必须丢弃一部分缓冲空间,用于容纳新的字符。 | |
* <p> | |
* 如果词素的最大长度为n,由于通常要多读1个字符,因此缓冲的大小则需要 n+1+3. | |
* | |
* </p> | |
* | |
* @author light | |
* @since 2019/10/26 | |
*/ | |
@Slf4j | |
public class PushbackBufferReader implements IPushbackReader { | |
/* 本类内部约定的特殊ascii码 它们被用作哨兵符号 */ | |
private static final char LIMIT = 0; //标识buffer中可读字符的边界 begin~LIMIT为缓冲窗口的大小 | |
private static final char EOF = 1; //输入流已经读取结束 | |
private static final char SKIP = 2; //可以跳过的字符 原则上保证后面至少有一个可读字符 | |
private int _version; | |
private int lexemeBegin; | |
private int forward; | |
private final int BUFFER_SIZE; | |
private char[] buffer; | |
private Reader reader; | |
private boolean hasMore = true; | |
private int lastForward; | |
private int lastVersion; | |
public PushbackBufferReader(Reader r) throws IOException { | |
this(r, 1024); | |
} | |
public PushbackBufferReader(Reader r, int bufferSize) throws IOException { | |
BUFFER_SIZE = bufferSize; | |
buffer = new char[BUFFER_SIZE]; | |
reader = r; | |
fillRemaining(); | |
} | |
/** | |
* 返回forward指向的字符 并将forward指针向后移动一位 | |
*/ | |
public int read() { | |
lastForward = forward; | |
lastVersion = _version; | |
int r = peek(); | |
forward(); | |
return r; | |
} | |
@Override | |
public void retract() { | |
reset(new Marker(lastForward, lastVersion)); | |
} | |
/** | |
* 获得forward指针指向的字符。 | |
*/ | |
public int peek() { | |
/* | |
检查如果为EOF 返回-1 | |
检查是否为LIMIT | |
如果不是 则返回对应字符 | |
如果是则装填另一个buf 并移动forward指针到另一个buf的开头 | |
*/ | |
char peek = buffer[forward]; | |
if (peek == LIMIT) { | |
forward = fillRemaining(); | |
return peek(); | |
} else if (peek == SKIP) { | |
forward(); | |
return peek(); | |
} else if (peek == EOF) { | |
hasMore = false; | |
return -1; | |
} | |
return peek; | |
} | |
void forward() { | |
if (!hasMore) | |
return; | |
if (forward == BUFFER_SIZE - 1) { | |
forward = 0; | |
} else | |
++forward; | |
} | |
/** | |
* 从字符输入流读取字符 填充到「remaining buffer」 | |
* 如果字符流已经读完(返回-1)则插入「EOF」哨兵到buffer,反之插入「LIMIT」哨兵. | |
* EOF标识是否已经读到输入流的末尾,LIMIT标记buffer中可用字符的终止位置. | |
* 因为每次装填都需要留一个位置放置哨兵,因此「remaining buffer」的大小至少为2,如果大小不足,将抛出异常.这通常意味着词素长度超出缓冲区大小. | |
* | |
* @return 返回「another buffer」的起始点 | |
*/ | |
private int fillRemaining() { | |
//断言forward处于LIMIT | |
//因为如果forward在任意都可以填充的话, 那么每次填充都需要遍历buffer寻找LIMIT哨兵的位置,来确定哪些空间被释放了 | |
Assert.isTrue(buffer[forward] == LIMIT, "当且仅当forward处于LIMIT字符时才可以触发装载"); | |
//为了避免同一个LIMIT哨兵重复触发装填 将其置为SKIP | |
buffer[forward] = SKIP; | |
int nextForwardIndex = -1; | |
//两个指针重合 且触发「装填」动作(这意味着两者处于LIMIT哨兵处 或者 是初次装填) | |
//这种情况,buffer可被视为是"空"的 | |
if (lexemeBegin == forward) { | |
fill(0, BUFFER_SIZE - 1); | |
nextForwardIndex = 0; | |
lexemeBegin = 0; | |
} else if (forward > lexemeBegin) { | |
//此种情况可能有左右2块区域可以用于装填 | |
//这里使用从左往右的顺序 优先在右边装填 | |
//也可以实现为 比较哪边空间大 就填充哪边 | |
//右边有至少两个位置 | |
if (forward < BUFFER_SIZE - 2) { | |
fill(forward + 1, BUFFER_SIZE - 1); | |
nextForwardIndex = forward + 1; | |
} | |
//左边有至少两个位置 | |
else if (lexemeBegin > 1) { | |
fill(0, lexemeBegin - 1); | |
nextForwardIndex = 0; | |
} | |
} else { | |
//中间至少有两个空位 | |
if (lexemeBegin - forward > 2) { | |
fill(forward + 1, lexemeBegin - 1); | |
nextForwardIndex = forward + 1; | |
} | |
} | |
if (nextForwardIndex == -1) | |
throw new ParseException("缓冲区空间不足,maxSize:%s", BUFFER_SIZE); | |
forward = nextForwardIndex; | |
return nextForwardIndex; | |
} | |
/** | |
* 从输入流读出字符到指定的缓冲区间,该区间会预留一个缓冲位用于存放哨兵字符(LIMIT/EOF) | |
* | |
* @param startIndex | |
* @param endIndex | |
* @throws IOException | |
*/ | |
private int fill(int startIndex, int endIndex) { | |
//检查溢出 | |
/* | |
startIndex,endIndex 为可用缓冲区边界索引 | |
因此,如果 startIndex==endIndex 表示可用缓冲区长度为1 | |
当 startIndex < endIndex==true时,剩余缓冲区长度至少为2 | |
因为要留出一个哨兵位,所以剩余缓冲区长度至少为2才可以继续装填 */ | |
Assert.isTrue(startIndex < endIndex, "超出缓冲限制(%s)", BUFFER_SIZE); | |
//开始装填 | |
int fillLength = endIndex - startIndex; //这里留出了一个哨兵位 | |
int readNum; | |
try { | |
readNum = reader.read(buffer, startIndex, fillLength); | |
} catch (IOException e) { | |
throw new EnvironmentError(e); | |
} | |
if (readNum == -1) { | |
buffer[startIndex] = EOF; | |
} else if (readNum < fillLength) //-1 或者 未装满 都意味着输入流已经结束 | |
buffer[startIndex + readNum] = EOF; | |
else | |
buffer[startIndex + readNum] = LIMIT; | |
log.trace("装填区间: ({},{}) 区间大小:{},实际装填:{}个字符 + 1个哨兵位", startIndex, endIndex, endIndex - startIndex + 1, readNum); | |
return readNum; | |
} | |
public Marker mark() { | |
return new Marker(forward, _version); | |
} | |
/** | |
* 回退到标记位置. | |
* 如果在在该标记之后buffer被修改过(比如触发了装填),这意味着要回退的位置已经被覆盖,此时抛出异常。 | |
* 通常,这是因为外部调用过flip(),导致本次要回退到的字符被"释放". | |
* | |
* @param marker 回退点 | |
*/ | |
public void reset(Marker marker) { | |
Assert.isTrue(marker.version == _version, "回退失败,buffer版本与mark的版本不一致,请确保mark以后没有调用flip更新版本!"); | |
forward = marker.mark; | |
} | |
/** | |
* 释放的已读的字符所占用的缓冲空间,「index<=read」范围以前的都会被释放. | |
* begin移动到下一个待读的字符处. | |
* 可以理解为已经确定了一个词素,并释放读过的区域,以腾出空间. | |
*/ | |
public void flip() { | |
lexemeBegin = forward; | |
++_version; | |
} | |
public void close() { | |
try { | |
reader.close(); | |
} catch (IOException e) { | |
throw new EnvironmentError(e); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment