/**
 * Demo driver: compiles a digit-run expression and prints every match
 * found in a sample string, one match per line.
 */
@Test
public void test1() {
    final String regex = "\\d+";
    final String input = "123A4234A234";
    Matcher digitRuns = Pattern.compile(regex).matcher(input);
    // Each successful find() advances to the next match in the input.
    for (boolean found = digitRuns.find(); found; found = digitRuns.find()) {
        System.out.println(digitRuns.group());
    }
}
/**
 * Compiles the given regular expression into a pattern with no match
 * flags set.
 *
 * @param regex the expression to be compiled
 * @return the newly constructed pattern
 */
public static Pattern compile(String regex) {
    final int noFlags = 0;
    return new Pattern(regex, noFlags);
}
/**
 * Stores the expression and its flag mask, resets the group counters and
 * builds the node tree.
 *
 * @param p the regular expression text
 * @param f the match-flag bit mask
 */
private Pattern(String p, int f) {
    pattern = p;
    // UNICODE_CHARACTER_CLASS implies UNICODE_CASE
    flags = ((f & UNICODE_CHARACTER_CLASS) != 0) ? (f | UNICODE_CASE) : f;

    // Reset group index count
    capturingGroupCount = 1;
    localCount = 0;

    if (pattern.length() == 0) {
        // Nothing to parse: the start node leads straight to lastAccept.
        root = new Start(lastAccept);
        matchRoot = lastAccept;
        return;
    }

    // Non-empty expression: run the parser via compile().
    compile();
}
/**
* Copies regular expression to an int array and invokes the parsing
* of the expression which will create the object tree.
*
* On return, matchRoot holds the head of the parsed node chain, root
* holds the entry node chosen by the final optimization step, the
* temporary parsing storage has been released and compiled is true.
*/
private void compile() {
// Handle canonical equivalences
if (has(CANON_EQ) && !has(LITERAL)) {
normalize();
} else {
normalizedPattern = pattern;
}
patternLength = normalizedPattern.length();
// Copy pattern to int array for convenience
// Use double zero to terminate pattern
// (the two extra slots are never written, so they keep their default 0)
temp = new int[patternLength + 2];
hasSupplementary = false;
int c, count = 0;
// Convert all chars into code points
// x advances by the number of chars the code point occupies, so a
// surrogate pair becomes a single entry in temp
for (int x = 0; x < patternLength; x += Character.charCount(c)) {
c = normalizedPattern.codePointAt(x);
if (isSupplementary(c)) {
hasSupplementary = true;
}
temp[count++] = c;
}
patternLength = count; // patternLength now in code points
// NOTE(review): presumably strips \Q...\E quoting from temp; the helper
// is not visible here -- confirm
if (! has(LITERAL))
RemoveQEQuoting();
// Allocate all temporary objects here.
buffer = new int[32];
groupNodes = new GroupHead[10];
namedGroups = null;
if (has(LITERAL)) {
// Literal pattern handling
// The whole pattern becomes one slice node followed by lastAccept
matchRoot = newSlice(temp, patternLength, hasSupplementary);
matchRoot.next = lastAccept;
} else {
// Start recursive descent parsing
// expr() is the entry point; the parsed chain ends at lastAccept
matchRoot = expr(lastAccept);
// Check extra pattern characters
// The parser must have consumed the whole pattern (cursor at the end)
if (patternLength != cursor) {
if (peek() == ')') {
throw error("Unmatched closing ')'");
} else {
throw error("Unexpected internal error");
}
}
}
// Peephole optimization
if (matchRoot instanceof Slice) {
// Let BnM.optimize substitute the slice; if it hands back the same
// node, fall back to a plain start node
root = BnM.optimize(matchRoot);
if (root == matchRoot) {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
} else if (matchRoot instanceof Begin || matchRoot instanceof First) {
// Begin and First nodes are used as the root directly, with no
// Start wrapper
root = matchRoot;
} else {
// StartS is chosen when the pattern contains supplementary code points
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
// Release temporary storage
temp = null;
buffer = null;
groupNodes = null;
patternLength = 0;
compiled = true;
}
/**
* The expression is parsed with branch nodes added for alternations.
* This may be called recursively to parse sub expressions that may
* contain alternations.
*
* @param end the node every alternative ultimately continues to; a lone
*        alternative is linked to it directly, several alternatives are
*        linked through a shared BranchConn whose next is end
* @return the head of the only alternative when no '|' follows it (this
*         is end itself if that alternative is empty), otherwise the
*         Branch node holding all alternatives
*/
private Node expr(Node end) {
Node prev = null;
Node firstTail = null;
Branch branch = null;
Node branchConn = null;
for (;;) {
// Parse one alternative: sequence() returns its head and leaves its
// tail in the root field
Node node = sequence(end);
// When sequence() returned end (empty alternative) it did not update
// root, so nodeTail is only used after a node == end / prev == end check
Node nodeTail = root; //double return
if (prev == null) {
// First alternative: remember it; no Branch is needed unless a '|' follows
prev = node;
firstTail = nodeTail;
} else {
// Branch
// Create the shared join node lazily, on the first '|' encountered
if (branchConn == null) {
branchConn = new BranchConn();
branchConn.next = end;
}
if (node == end) {
// if the node returned from sequence() is "end"
// we have an empty expr, set a null atom into
// the branch to indicate to go "next" directly.
node = null;
} else {
// the "tail.next" of each atom goes to branchConn
nodeTail.next = branchConn;
}
if (prev == branch) {
// A Branch already exists: append this alternative to it
branch.add(node);
} else {
// Second alternative: build the Branch from the first one and this one
if (prev == end) {
// The first alternative was empty; store it as a null atom too
prev = null;
} else {
// replace the "end" with "branchConn" at its tail.next
// when put the "prev" into the branch as the first atom.
firstTail.next = branchConn;
}
prev = branch = new Branch(prev, node, branchConn);
}
}
// Anything other than '|' ends this expression
if (peek() != '|') {
return prev;
}
// Consume the '|' and parse the next alternative
next();
}
}
/**
* Parsing of sequences between alternations.
*
* Nodes are parsed one at a time and chained through their next fields
* until a '|', a ')' or the end of the pattern is reached.
*
* @param end the node the last parsed node is linked to
* @return the first node of the sequence, or end itself when nothing was
*         parsed; for a non-empty sequence the last node is additionally
*         handed back through the root field ("double return")
*/
private Node sequence(Node end) {
Node head = null;
Node tail = null;
Node node = null;
LOOP:
for (;;) {
int ch = peek();
switch (ch) {
case '(':
// Because group handles its own closure,
// we need to treat it differently
node = group0();
// Check for comment or flag group
if (node == null)
continue;
if (head == null)
head = node;
else
tail.next = node;
// Double return: Tail was returned in root
tail = root;
// continue skips the closure() call and the linking after the switch
continue;
case '[':
node = clazz(true);
break;
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
// \p / \P: oneLetter stays true unless a '{' follows;
// comp is true for the upper-case form
boolean oneLetter = true;
boolean comp = (ch == 'P');
ch = next(); // Consume { if present
if (ch != '{') {
unread();
} else {
oneLetter = false;
}
node = family(oneLetter, comp);
} else {
// Any other escape: step back and let atom() handle it
unread();
node = atom();
}
break;
case '^':
next();
// The caret node type depends on the MULTILINE and UNIX_LINES flags
if (has(MULTILINE)) {
if (has(UNIX_LINES))
node = new UnixCaret();
else
node = new Caret();
} else {
node = new Begin();
}
break;
case '$':
next();
// UNIX_LINES picks the node class; MULTILINE is passed to it as a flag
if (has(UNIX_LINES))
node = new UnixDollar(has(MULTILINE));
else
node = new Dollar(has(MULTILINE));
break;
case '.':
next();
// DOTALL takes precedence; otherwise UNIX_LINES picks the dot variant
if (has(DOTALL)) {
node = new All();
} else {
if (has(UNIX_LINES))
node = new UnixDot();
else {
node = new Dot();
}
}
break;
case '|':
case ')':
// End of this sequence; the character is left for the caller
break LOOP;
case ']': // Now interpreting dangling ] and } as literals
case '}':
node = atom();
break;
case '?':
case '*':
case '+':
// A quantifier with nothing before it to apply to
next();
throw error("Dangling meta character '" + ((char)ch) + "'");
case 0:
// A zero at or past patternLength ends the sequence; a zero inside
// the pattern falls through and is parsed by atom()
if (cursor >= patternLength) {
break LOOP;
}
// Fall through
default:
node = atom();
break;
}
// NOTE(review): closure() presumably wraps node with a quantifier that
// follows it; the helper is not visible here -- confirm
node = closure(node);
// Append the resulting node to the chain
if (head == null) {
head = tail = node;
} else {
tail.next = node;
tail = node;
}
}
// Empty sequence: return end itself; root is left untouched in this case
if (head == null) {
return end;
}
tail.next = end;
root = tail; //double return
return head;
}
/**
 * Demo driver: runs a digit-run expression over a sample string and
 * prints each match on its own line.
 */
@Test
public void test1() {
    final Pattern digits = Pattern.compile("\\d+");
    final Matcher scanner = digits.matcher("123A4234A234");
    // Keep asking for the next match until the input is exhausted.
    boolean hasMatch = scanner.find();
    while (hasMatch) {
        System.out.println(scanner.group());
        hasMatch = scanner.find();
    }
}
/**
 * Looks for the next match, resuming from where the previous one ended.
 *
 * @return true if a further match was found; false otherwise, in which
 *         case every entry of groups is reset to -1 when the resume
 *         position lies beyond the region
 */
public boolean find() {
    // Resume at last; step one position forward when last equals first
    // so the search does not restart at the same place.
    int start = (last == first) ? last + 1 : last;

    // If next search starts before region, start it at region
    start = Math.max(start, from);

    if (start <= to) {
        return search(start);
    }

    // Next search starts beyond the region, so it fails: clear all groups
    int slot = 0;
    while (slot < groups.length) {
        groups[slot] = -1;
        slot++;
    }
    return false;
}
/**
 * Runs the pattern's root node against the text, starting at the given
 * index, after resetting the per-search state.
 *
 * @param from index to start at; negative values are treated as 0
 * @return true if the root node reported a match
 */
boolean search(int from) {
    this.hitEnd = false;
    this.requireEnd = false;

    final int start = Math.max(from, 0);
    this.first = start;
    // Only a negative oldLast is replaced by the start index
    if (this.oldLast < 0) {
        this.oldLast = start;
    }

    // Clear every group slot before matching
    int slot = 0;
    while (slot < groups.length) {
        groups[slot] = -1;
        slot++;
    }
    acceptMode = NOANCHOR;

    final boolean matched = parentPattern.root.match(this, start, text);
    if (!matched) {
        this.first = -1;
    }
    this.oldLast = this.last;
    return matched;
}
/**
 * Base class of the node chain. A plain Node accepts unconditionally;
 * its next field initially points at Pattern.accept.
 */
static class Node extends Object {
    Node next;

    Node() {
        next = Pattern.accept;
    }

    /**
     * This method implements the classic accept node: it records i as
     * the end of the match, stores the start and end in the first two
     * group slots and reports success.
     */
    boolean match(Matcher matcher, int i, CharSequence seq) {
        matcher.last = i;
        matcher.groups[0] = matcher.first;
        matcher.groups[1] = i;
        return true;
    }

    /**
     * This method is good for all zero length assertions: it delegates
     * to the next node, or returns info.deterministic at the end of the
     * chain.
     */
    boolean study(TreeInfo info) {
        return (next == null) ? info.deterministic : next.study(info);
    }
}
/**
 * Entry node that tries the rest of the chain at successive start
 * positions. minLength is taken from studying the chain at construction
 * time and bounds the last position worth trying.
 */
static class Start extends Node {
    int minLength;

    Start(Node node) {
        this.next = node;
        TreeInfo studied = new TreeInfo();
        node.study(studied);
        minLength = studied.minLength;
    }

    /**
     * Tries each position from i up to (matcher.to - minLength). On the
     * first success it records the match start and the group 0 bounds;
     * if no position succeeds, hitEnd is set and false is returned.
     */
    boolean match(Matcher matcher, int i, CharSequence seq) {
        final int lastStart = matcher.to - minLength;
        // When i is already past lastStart the loop body never runs and
        // control falls straight through to the failure path.
        for (int pos = i; pos <= lastStart; pos++) {
            if (!next.match(matcher, pos, seq)) {
                continue;
            }
            matcher.first = pos;
            matcher.groups[0] = pos;
            matcher.groups[1] = matcher.last;
            return true;
        }
        matcher.hitEnd = true;
        return false;
    }

    /**
     * Studies the rest of the chain, then marks the result as having no
     * valid maximum and as non-deterministic.
     */
    boolean study(TreeInfo info) {
        next.study(info);
        info.deterministic = false;
        info.maxValid = false;
        return false;
    }
}
/**
 * Caret node that treats only '\n' as a line terminator. It matches at
 * the start bound or directly after a '\n', but never at the end bound.
 */
static final class UnixCaret extends Node {
    boolean match(Matcher matcher, int i, CharSequence seq) {
        // With anchoring bounds the region limits apply, otherwise the
        // limits of the whole text are used.
        final boolean anchored = matcher.anchoringBounds;
        final int lower = anchored ? matcher.from : 0;
        final int upper = anchored ? matcher.to : matcher.getTextLength();

        // Perl does not match ^ at end of input even after newline
        if (i == upper) {
            matcher.hitEnd = true;
            return false;
        }

        // Past the start bound, the previous character must be a newline
        if (i > lower && seq.charAt(i - 1) != '\n') {
            return false;
        }
        return next.match(matcher, i, seq);
    }
}
/**
 * Dollar node that treats only '\n' as a line terminator. The multiline
 * flag decides whether it may match before any '\n' or only before a
 * final one.
 */
static final class UnixDollar extends Node {
    boolean multiline;

    UnixDollar(boolean mul) {
        multiline = mul;
    }

    boolean match(Matcher matcher, int i, CharSequence seq) {
        final int limit = matcher.anchoringBounds
                ? matcher.to
                : matcher.getTextLength();

        if (i < limit) {
            // Before the end, only a newline can satisfy $
            if (seq.charAt(i) != '\n') {
                return false;
            }
            // If multiline return next.match without setting
            // matcher.hitEnd
            if (multiline) {
                return next.match(matcher, i, seq);
            }
            // If not multiline, then only possible to
            // match at very end or one before end
            if (i != limit - 1) {
                return false;
            }
        }

        // Matching because at the end or 1 before the end;
        // more input could change this so set hitEnd
        matcher.hitEnd = true;
        // If a $ matches because of end of input, then more input
        // could cause it to fail!
        matcher.requireEnd = true;
        return next.match(matcher, i, seq);
    }

    /**
     * Studies the rest of the chain and reports whether it is
     * deterministic.
     */
    boolean study(TreeInfo info) {
        next.study(info);
        return info.deterministic;
    }
}
st=>start: Start
o1=>operation: UnixCaret
o2=>operation: Dollar
o3=>operation: ...
end=>end: End
st->o1->o2->o3->end