Logo Search packages:      
Sourcecode: jruby-joni version File versions  Download package

Matcher.java

/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of 
 * this software and associated documentation files (the "Software"), to deal in 
 * the Software without restriction, including without limitation the rights to 
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
 * SOFTWARE.
 */

package org.joni;

import static org.joni.Option.isFindLongest;

import org.jcodings.Encoding;
import org.jcodings.IntHolder;
import org.joni.constants.AnchorType;

00029 public abstract class Matcher extends IntHolder {
    protected final Regex regex;
    protected final Encoding enc;

    protected final byte[]bytes;
    protected final int str;
    protected final int end;

    protected int msaStart;
    protected int msaOptions;
    protected final Region msaRegion;
    protected int msaBestLen;
    protected int msaBestS;

    protected int msaBegin;
    protected int msaEnd;

    public Matcher(Regex regex, byte[]bytes) {
        this(regex, bytes, 0, bytes.length);
    }

    public Matcher(Regex regex, byte[]bytes, int p, int end) {
        this.regex = regex;
        this.enc = regex.enc;

        this.bytes = bytes;
        this.str = p;
        this.end = end;

        this.msaRegion = regex.numMem == 0 ? null : new Region(regex.numMem + 1);
    }

    // main matching method
    protected abstract int matchAt(int range, int sstart, int sprev);    

    protected abstract void stateCheckBuffInit(int strLength, int offset, int stateNum);
    protected abstract void stateCheckBuffClear();

    public final Region getRegion() {
        return msaRegion;
    }
    
    public final Region getEagerRegion() {
        return msaRegion != null ? msaRegion : new Region(msaBegin, msaEnd);
    }
    
    public final int getBegin() {
        return msaBegin;
    }
    
    public final int getEnd() {
        return msaEnd;
    }

    protected final void msaInit(int option, int start) {
        msaOptions = option;
        msaStart = start;
        if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) msaBestLen = -1;
    }    

    public final int match(int at, int range, int option) {
        msaInit(option, at);
        
        if (Config.USE_COMBINATION_EXPLOSION_CHECK) {
            int offset = at = str;
            stateCheckBuffInit(end - str, offset, regex.numCombExpCheck); // move it to construction?
        } // USE_COMBINATION_EXPLOSION_CHECK
        
        int prev = enc.prevCharHead(bytes, str, at, end);
        
        if (Config.USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE) {
            return matchAt(end /*range*/, at, prev);
        } else {
            return matchAt(range /*range*/, at, prev);
        }
    }
    
    int low, high; // these are the return values
    private boolean forwardSearchRange(byte[]bytes, int str, int end, int s, int range, IntHolder lowPrev) {
        int pprev = -1;
        int p = s;

        if (Config.DEBUG_SEARCH) {
            Config.log.println("forward_search_range: "+
                                "str: " + str +
                                ", end: " + end +
                                ", s: " + s +
                                ", range: " + range);
        }        
        
        if (regex.dMin > 0) {
            if (enc.isSingleByte()) {
                p += regex.dMin;
            } else {
                int q = p + regex.dMin;
                while (p < q) p += enc.length(bytes, p, end);
            }
        }

        retry:while (true) {
            p = regex.searchAlgorithm.search(regex, bytes, p, end, range);
            
            if (p != -1 && p < range) {
                if (p - regex.dMin < s) {
                    // retry_gate:
                    pprev = p;
                    p += enc.length(bytes, p, end);
                    continue retry;
                }
                
                if (regex.subAnchor != 0) {
                    switch (regex.subAnchor) {
                    case AnchorType.BEGIN_LINE:
                        if (p != str) {
                            int prev = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, p, end);
                            if (!enc.isNewLine(bytes, prev, end)) {
                                // goto retry_gate;
                                pprev = p;
                                p += enc.length(bytes, p, end);
                                continue retry;
                            }
                        }
                        break;
                        
                    case AnchorType.END_LINE:
                        if (p == end) {
                            if (!Config.USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE) {
                                int prev = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, p, end);
                                if (prev != -1 && enc.isNewLine(bytes, prev, end)) {
                                    // goto retry_gate;
                                    pprev = p;
                                    p += enc.length(bytes, p, end);
                                    continue retry;
                                }
                            }
                        } else if (!enc.isNewLine(bytes, p, end) && (!Config.USE_CRNL_AS_LINE_TERMINATOR || !enc.isMbcCrnl(bytes, p, end))) {
                            //if () break;
                            // goto retry_gate;
                            pprev = p;
                            p += enc.length(bytes, p, end);
                            continue retry;
                        }
                        break;
                    } // switch
                }

                if (regex.dMax == 0) {
                    low = p;
                    if (lowPrev != null) { // ??? // remove null checks
                        if (low > s) {
                            lowPrev.value = enc.prevCharHead(bytes, s, p, end);
                        } else {
                            lowPrev.value = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, p, end);
                        }
                    }
                } else {
                    if (regex.dMax != MinMaxLen.INFINITE_DISTANCE) {
                        low = p - regex.dMax;

                        if (low > s) {
                            low = enc.rightAdjustCharHeadWithPrev(bytes, s, low, end, lowPrev);
                            if (lowPrev != null && lowPrev.value == -1) {
                                lowPrev.value = enc.prevCharHead(bytes, (pprev != -1) ? pprev : s, low, end);
                            }
                        } else {
                            if (lowPrev != null) {
                                lowPrev.value = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, low, end);
                            }
                        }
                    }
                }
                /* no needs to adjust *high, *high is used as range check only */
                high = p - regex.dMin;
                
                if (Config.DEBUG_SEARCH) {
                    Config.log.println("forward_search_range success: "+
                                        "low: " + (low - str) +
                                        ", high: " + (high - str) +
                                        ", dmin: " + regex.dMin +
                                        ", dmax: " + regex.dMax);
                } 
                
                return true;    /* success */
            }

            return false;   /* fail */
        } //while            
    }
    
    // low, high
    private boolean backwardSearchRange(byte[]bytes, int str, int end, int s, int range, int adjrange) {
        range += regex.dMin;
        int p = s;
        
        retry:while (true) {
            p = regex.searchAlgorithm.searchBackward(regex, bytes, range, adjrange, end, p, s, range);
            
            if (p != -1) {
                if (regex.subAnchor != 0) {
                    switch (regex.subAnchor) {
                    case AnchorType.BEGIN_LINE:
                        if (p != str) {
                            int prev = enc.prevCharHead(bytes, str, p, end);
                            if (!enc.isNewLine(bytes, prev, end)) {
                                p = prev;
                                continue retry;
                            }
                        }
                        break;
                        
                    case AnchorType.END_LINE:
                        if (p == end) {
                            if (!Config.USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE) {
                                int prev = enc.prevCharHead(bytes, adjrange, p, end);
                                if (prev == -1) return false;
                                if (enc.isNewLine(bytes, prev, end)) {
                                    p = prev;
                                    continue retry;
                                }
                            }
                        } else if (!enc.isNewLine(bytes, p, end) && (!Config.USE_CRNL_AS_LINE_TERMINATOR || !enc.isMbcCrnl(bytes, p, end))) {
                            p = enc.prevCharHead(bytes, adjrange, p, end);
                            if (p == -1) return false;
                            continue retry;
                        }
                        break;
                    } // switch
                }
                
                /* no needs to adjust *high, *high is used as range check only */
                if (regex.dMax != MinMaxLen.INFINITE_DISTANCE) {
                    low = p - regex.dMax;
                    high = p - regex.dMin;
                    high = enc.rightAdjustCharHead(bytes, adjrange, high, end);
                }
                
                if (Config.DEBUG_SEARCH) {
                    Config.log.println("backward_search_range: "+
                                        "low: " + (low - str) +
                                        ", high: " + (high - str));
                } 
                
                return true;
            }
            
            if (Config.DEBUG_SEARCH) Config.log.println("backward_search_range: fail.");
            return false;
        } // while
    }
    
    // MATCH_AND_RETURN_CHECK
    private boolean matchCheck(int upperRange, int s, int prev) {
        if (Config.USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE) {
            if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) {
                //range = upperRange;
                if (matchAt(upperRange, s, prev) != -1) {
                    if (!isFindLongest(regex.options)) return true;
                }
            } else {
                //range = upperRange;
                if (matchAt(upperRange, s, prev) != -1) return true;
            }
        } else {
            if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) {
                if (matchAt(end, s, prev) != -1) {
                    //range = upperRange;
                    if (!isFindLongest(regex.options)) return true;
                }
            } else {
                //range = upperRange;
                if (matchAt(end, s, prev) != -1) return true;
            }            
        }
        return false;
    }
    
    public final int search(int start, int range, int option) {
        int s, prev;
        int origStart = start;
        int origRange = range;
        
        if (Config.DEBUG_SEARCH) {
            Config.log.println("onig_search (entry point): "+
                        "str: " + str + 
                    ", end: " + (end - str) + 
                    ", start: " + (start - str) + 
                    ", range " + (range - str));
        }
        
        if (start > end || start < str) return -1;
        
        /* anchor optimize: resume search range */
        if (regex.anchor != 0 && str < end) {
            int minSemiEnd, maxSemiEnd;
            
            if ((regex.anchor & AnchorType.BEGIN_POSITION) != 0) {
                /* search start-position only */
                // !begin_position:!
                if (range > start) {
                    range = start + 1;
                } else {
                    range = start;
                }
            } else if ((regex.anchor & AnchorType.BEGIN_BUF) != 0) {
                /* search str-position only */
                if (range > start) {
                    if (start != str) return -1; // mismatch_no_msa;
                    range = str + 1;
                } else { 
                    if (range <= str) {
                        start = str;
                        range = str;
                    } else {
                        return -1; // mismatch_no_msa;
                    }
                }
            } else if ((regex.anchor & AnchorType.END_BUF) != 0) {
                minSemiEnd = maxSemiEnd = end;
                // !end_buf:!
                if (endBuf(start, range, minSemiEnd, maxSemiEnd)) return -1; // mismatch_no_msa;
            } else if ((regex.anchor & AnchorType.SEMI_END_BUF) != 0) {
                int preEnd = enc.stepBack(bytes, str, end, end, 1);
                maxSemiEnd = end;
                if (enc.isNewLine(bytes, preEnd, end)) {
                    minSemiEnd = preEnd;
                    if (Config.USE_CRNL_AS_LINE_TERMINATOR) {
                        preEnd = enc.stepBack(bytes, str, preEnd, end, 1);
                        if (preEnd != -1 && enc.isMbcCrnl(bytes, preEnd, end)) {
                            minSemiEnd = preEnd;
                        }
                    }
                    if (minSemiEnd > str && start <= minSemiEnd) {
                        // !goto end_buf;!
                        if (endBuf(start, range, minSemiEnd, maxSemiEnd)) return -1; // mismatch_no_msa;                        
                    }
                } else {
                    minSemiEnd = end;
                    // !goto end_buf;!
                    if (endBuf(start, range, minSemiEnd, maxSemiEnd)) return -1; // mismatch_no_msa;                    
                }
            } else if ((regex.anchor & AnchorType.ANYCHAR_STAR_ML) != 0) {
                // goto !begin_position;!
                if (range > start) {
                    range = start + 1;
                } else {
                    range = start;
                }                
            }
            
        } else if (str == end) { /* empty string */
            // empty address ?
            if (Config.DEBUG_SEARCH) {
                Config.log.println("onig_search: empty string.");
            }
            
            if (regex.thresholdLength == 0) {
                s = start = str;
                prev = -1;
                msaInit(option, start);

                if (Config.USE_COMBINATION_EXPLOSION_CHECK) stateCheckBuffClear();

                if (matchCheck(end, s, prev)) return match(s);
                return mismatch();
            }
            return -1; // goto mismatch_no_msa;
        }
        
        if (Config.DEBUG_SEARCH) {
            Config.log.println("onig_search(apply anchor): " +
                                "end: " + (end - str) +
                                ", start " + (start - str) + 
                                ", range " + (range - str));
        }

        msaInit(option, origStart);
        if (Config.USE_COMBINATION_EXPLOSION_CHECK) {
            int offset = Math.min(start, range) - str;
            stateCheckBuffInit(end - str, offset, regex.numCombExpCheck);
        }
        
        s = start;
        if (range > start) {    /* forward search */
            if (s > str) {
                prev = enc.prevCharHead(bytes, str, s, end);
            } else {
                prev = 0; // -1
            }
            
            if (regex.searchAlgorithm != SearchAlgorithm.NONE) {
                int schRange = range;
                if (regex.dMax != 0) {
                    if (regex.dMax == MinMaxLen.INFINITE_DISTANCE) {
                        schRange = end;
                    } else {
                        schRange += regex.dMax;
                        if (schRange > end) schRange = end;
                    }
                }
                if ((end - start) < regex.thresholdLength) return mismatch();
                
                if (regex.dMax != MinMaxLen.INFINITE_DISTANCE) {
                    do {
                        if (!forwardSearchRange(bytes, str, end, s, schRange, this)) return mismatch(); // low, high, lowPrev
                        if (s < low) {
                            s = low;
                            prev = value;
                        }
                        while (s <= high) {
                            if (matchCheck(origRange, s, prev)) return match(s); // ???
                            prev = s;
                            s += enc.length(bytes, s, end);
                        }
                    } while (s < range);
                    return mismatch();
                    
                } else { /* check only. */
                    if (!forwardSearchRange(bytes, str, end, s, schRange, null)) return mismatch();
                    
                    if ((regex.anchor & AnchorType.ANYCHAR_STAR) != 0) {
                        do {
                            if (matchCheck(origRange, s, prev)) return match(s);
                            prev = s;
                            s += enc.length(bytes, s, end);
                            
                            while (!enc.isNewLine(bytes, prev, end) && s < range) {
                                prev = s;
                                s += enc.length(bytes, s, end);
                            }
                        } while (s < range);
                        return mismatch();
                    }
                    
                }
            }
            
            do {
                if (matchCheck(origRange, s, prev)) return match(s);
                prev = s;
                s += enc.length(bytes, s, end);
            } while (s < range);
            
            if (s == range) { /* because empty match with /$/. */
                if (matchCheck(origRange, s, prev)) return match(s);
            }
        } else { /* backward search */
            if (Config.USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE) {            
                if (origStart < end) {
                    origStart += enc.length(bytes, origStart, end); // /* is upper range */ 
                }
            }
            
            if (regex.searchAlgorithm != SearchAlgorithm.NONE) {
                int adjrange;
                if (range < end) {
                    adjrange = enc.leftAdjustCharHead(bytes, str, range, end);
                } else {
                    adjrange = end;
                }
                if (regex.dMax != MinMaxLen.INFINITE_DISTANCE && (end - range) >= regex.thresholdLength) {
                    do {
                        int schStart = s + regex.dMax;
                        if (schStart > end) schStart = end;
                        if (!backwardSearchRange(bytes, str, end, schStart, range, adjrange)) return mismatch(); // low, high
                        if (s > high) s = high;
                        while (s != -1 && s >= low) {
                            prev = enc.prevCharHead(bytes, str, s, end);
                            if (matchCheck(origStart, s, prev)) return match(s);
                            s = prev;
                        }
                    } while (s >= range);
                    return mismatch();
                } else { /* check only. */
                    if ((end - range) < regex.thresholdLength) return mismatch();
                    
                    int schStart = s;
                    if (regex.dMax != 0) {
                        if (regex.dMax == MinMaxLen.INFINITE_DISTANCE) {
                            schStart = end;
                        } else {
                            schStart += regex.dMax;
                            if (schStart > end) {
                                schStart = end;
                            } else {
                                schStart = enc.leftAdjustCharHead(bytes, start, schStart, end);
                            }
                        }
                    }
                    if (!backwardSearchRange(bytes, str, end, schStart, range, adjrange)) return mismatch();
                }
            }
            
            do {
                prev = enc.prevCharHead(bytes, str, s, end);
                if (matchCheck(origStart, s, prev)) return match(s);
                s = prev;
            } while (s >= range);
            
        }
        return mismatch();
    }
    
    private boolean endBuf(int start, int range, int minSemiEnd, int maxSemiEnd) {
        if ((maxSemiEnd - str) < regex.anchorDmin) return true; // mismatch_no_msa;
        
        if (range > start) {
            if ((minSemiEnd - start) > regex.anchorDmax) {
                start = minSemiEnd - regex.anchorDmax;
                if (start < end) {
                    start = enc.rightAdjustCharHead(bytes, str, start, end);
                } else { /* match with empty at end */
                    start = enc.prevCharHead(bytes, str, end, end);
                }
            }
            if ((maxSemiEnd - (range - 1)) < regex.anchorDmin) {
                range = maxSemiEnd - regex.anchorDmin + 1;
            }
            if (start >= range) return true; // mismatch_no_msa;
        } else {
            if ((minSemiEnd - range) > regex.anchorDmax) {
                range = minSemiEnd - regex.anchorDmax;
            }
            if ((maxSemiEnd - start) < regex.anchorDmin) {                    
                start = maxSemiEnd - regex.anchorDmin;
                start = enc.leftAdjustCharHead(bytes, str, start, end);
            }
            if (range > start) return true; // mismatch_no_msa;
        }
        return false;
    }
    
    private int match(int s) {
        return s - str; // sstart ???
    }
    
    private int mismatch() {
        if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) {
            if (msaBestLen >= 0) {
                int s = msaBestS;
                return match(s);
            }
        }
        // falls through finish:
        return -1;
    }
}

Generated by  Doxygen 1.6.0   Back to index