Skip to content

Commit e3e64e4

Browse files
committed
Optimizations for Seq's implementations of sequ...
Optimizations for Seq's implementations of sequence search algorithms. Contributed by Rex Kerr. Closes SI-4828, no review.
1 parent 333f540 commit e3e64e4

File tree

3 files changed

+278
-69
lines changed

3 files changed

+278
-69
lines changed

src/library/scala/collection/SeqLike.scala

Lines changed: 214 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -339,8 +339,15 @@ trait SeqLike[+A, +Repr] extends IterableLike[A, Repr] with GenSeqLike[A, Repr]
339339
* match the elements of sequence `that`, or `-1` if no such subsequence exists.
340340
*/
341341
def indexOfSlice[B >: A](that: GenSeq[B], from: Int): Int =
342-
if (this.hasDefiniteSize && that.hasDefiniteSize)
343-
SeqLike.indexOf(thisCollection, 0, length, that.seq, 0, that.length, from)
342+
if (this.hasDefiniteSize && that.hasDefiniteSize) {
343+
val l = length
344+
val tl = that.length
345+
val clippedFrom = math.max(0, from)
346+
if (from > l) -1
347+
else if (tl < 1) clippedFrom
348+
else if (l < tl) -1
349+
else SeqLike.kmpSearch(thisCollection, clippedFrom, l, that.seq, 0, tl, true)
350+
}
344351
else {
345352
var i = from
346353
var s: Seq[A] = thisCollection drop i
@@ -374,8 +381,16 @@ trait SeqLike[+A, +Repr] extends IterableLike[A, Repr] with GenSeqLike[A, Repr]
374381
* @return the last index `<= end` such that the elements of this $coll starting at this index
375382
* match the elements of sequence `that`, or `-1` if no such subsequence exists.
376383
*/
377-
def lastIndexOfSlice[B >: A](that: GenSeq[B], end: Int): Int =
378-
SeqLike.lastIndexOf(thisCollection, 0, length, that.seq, 0, that.length, end)
384+
def lastIndexOfSlice[B >: A](that: GenSeq[B], end: Int): Int = {
385+
val l = length
386+
val tl = that.length
387+
val clippedL = math.min(l-tl, end)
388+
389+
if (end < 0) -1
390+
else if (tl < 1) clippedL
391+
else if (l < tl) -1
392+
else SeqLike.kmpSearch(thisCollection, 0, clippedL+tl, that.seq, 0, tl, false)
393+
}
379394

380395
@bridge
381396
def lastIndexOfSlice[B >: A](that: Seq[B], end: Int): Int = lastIndexOfSlice(that: GenSeq[B], end)
@@ -693,58 +708,167 @@ trait SeqLike[+A, +Repr] extends IterableLike[A, Repr] with GenSeqLike[A, Repr]
693708
/** The companion object for trait `SeqLike`.
694709
*/
695710
object SeqLike {
696-
/** A KMP implementation, based on the undoubtedly reliable wikipedia entry.
711+
// KMP search utilities
712+
713+
/** Make sure a target sequence has fast, correctly-ordered indexing for KMP.
697714
*
698-
* @author paulp
699-
* @since 2.8
715+
* @author Rex Kerr
716+
* @since 2.10
717+
* @param W The target sequence
718+
* @param n0 The first element in the target sequence that we should use
719+
* @param n1 The far end of the target sequence that we should use (exclusive)
720+
* @return Target packed in an IndexedSeq (taken from iterator unless W already is an IndexedSeq)
700721
*/
701-
private def KMP[B](S: Seq[B], W: Seq[B]): Option[Int] = {
702-
// trivial cases
703-
if (W.isEmpty) return Some(0)
704-
else if (W drop 1 isEmpty) return (S indexOf W(0)) match {
705-
case -1 => None
706-
case x => Some(x)
707-
}
708-
709-
val T: Array[Int] = {
710-
val arr = new Array[Int](W.length)
711-
var pos = 2
712-
var cnd = 0
713-
arr(0) = -1
714-
arr(1) = 0
715-
while (pos < W.length) {
716-
if (W(pos - 1) == W(cnd)) {
717-
arr(pos) = cnd + 1
718-
pos += 1
719-
cnd += 1
720-
}
721-
else if (cnd > 0) {
722-
cnd = arr(cnd)
723-
}
724-
else {
725-
arr(pos) = 0
726-
pos += 1
727-
}
722+
private def kmpOptimizeWord[B](W: Seq[B], n0: Int, n1: Int, forward: Boolean) = W match {
723+
case iso: IndexedSeq[_] =>
724+
// Already optimized for indexing--use original (or custom view of original)
725+
if (forward && n0==0 && n1==W.length) iso.asInstanceOf[IndexedSeq[B]]
726+
else if (forward) new IndexedSeq[B] {
727+
val length = n1 - n0
728+
def apply(x: Int) = iso(n0 + x).asInstanceOf[B]
728729
}
729-
arr
730-
}
730+
else new IndexedSeq[B] {
731+
def length = n1 - n0
732+
def apply(x: Int) = iso(n1 - 1 - x).asInstanceOf[B]
733+
}
734+
case _ =>
735+
// W is probably bad at indexing. Pack in array (in correct orientation)
736+
// Would be marginally faster to special-case each direction
737+
new IndexedSeq[B] {
738+
private[this] val Warr = new Array[AnyRef](n1-n0)
739+
private[this] val delta = if (forward) 1 else -1
740+
private[this] val done = if (forward) n1-n0 else -1
741+
val wit = W.iterator.drop(n0)
742+
var i = if (forward) 0 else (n1-n0-1)
743+
while (i != done) {
744+
Warr(i) = wit.next.asInstanceOf[AnyRef]
745+
i += delta
746+
}
731747

732-
var m, i = 0
733-
def mi = m + i
748+
val length = n1 - n0
749+
def apply(x: Int) = Warr(x).asInstanceOf[B]
750+
}
751+
}
734752

735-
while (mi < S.length) {
736-
if (W(i) == S(mi)) {
737-
i += 1
738-
if (i == W.length)
739-
return Some(m)
753+
/** Make a jump table for KMP search.
754+
*
755+
* @author paulp, Rex Kerr
756+
* @since 2.10
757+
* @param Wopt The target sequence, as at least an IndexedSeq
758+
* @param wlen Length of the target sequence, passed explicitly in case Wopt is only an IndexedSeq and not an IndexedSeqOptimized
759+
* @return KMP jump table for target sequence
760+
*/
761+
private def kmpJumpTable[B](Wopt: IndexedSeq[B], wlen: Int) = {
762+
val arr = new Array[Int](wlen)
763+
var pos = 2
764+
var cnd = 0
765+
arr(0) = -1
766+
arr(1) = 0
767+
while (pos < wlen) {
768+
if (Wopt(pos-1) == Wopt(cnd)) {
769+
arr(pos) = cnd + 1
770+
pos += 1
771+
cnd += 1
772+
}
773+
else if (cnd > 0) {
774+
cnd = arr(cnd)
740775
}
741776
else {
742-
m = mi - T(i)
743-
if (i > 0)
744-
i = T(i)
777+
arr(pos) = 0
778+
pos += 1
745779
}
746780
}
747-
None
781+
arr
782+
}
783+
784+
/** A KMP implementation, based on the undoubtedly reliable Wikipedia entry.
785+
* Note: I made this private to keep it from entering the API. That can be reviewed.
786+
*
787+
* @author paulp, Rex Kerr
788+
* @since 2.10
789+
* @param S Sequence that may contain target
790+
* @param m0 First index of S to consider
791+
* @param m1 Last index of S to consider (exclusive)
792+
* @param W Target sequence
793+
* @param n0 First index of W to match
794+
* @param n1 Last index of W to match (exclusive)
795+
* @param forward Direction of search (from beginning==true, from end==false)
796+
* @return Index of start of sequence if found, -1 if not (relative to beginning of S, not m0).
797+
*/
798+
private def kmpSearch[B](S: Seq[B], m0: Int, m1: Int, W: Seq[B], n0: Int, n1: Int, forward: Boolean): Int = {
799+
// Check for redundant case when target has single valid element
800+
@inline def clipR(x: Int, y: Int) = if (x<y) x else -1
801+
@inline def clipL(x: Int, y: Int) = if (x>y) x else -1
802+
803+
if (n1 == n0+1) {
804+
if (forward)
805+
clipR(S.indexOf(W(n0), m0), m1)
806+
else
807+
clipL(S.lastIndexOf(W(n0), m1-1), m0-1)
808+
}
809+
810+
// Check for redundant case when both sequences are same size
811+
else if (m1-m0 == n1-n0) {
812+
// Accepting a little slowness for the uncommon case.
813+
if (S.view.slice(m0, m1) == W.view.slice(n0, n1)) m0
814+
else -1
815+
}
816+
// Now we know we actually need KMP search, so do it
817+
else S match {
818+
case xs: IndexedSeq[_] =>
819+
// We can index into S directly; it should be adequately fast
820+
val Wopt = kmpOptimizeWord(W, n0, n1, forward)
821+
val T = kmpJumpTable(Wopt, n1-n0)
822+
var i, m = 0
823+
val zero = if (forward) m0 else m1-1
824+
val delta = if (forward) 1 else -1
825+
while (i+m < m1-m0) {
826+
if (Wopt(i) == S(zero+delta*(i+m))) {
827+
i += 1
828+
if (i == n1-n0) return (if (forward) m+m0 else m1-m-i)
829+
}
830+
else {
831+
val ti = T(i)
832+
m += i - ti
833+
if (i > 0) i = ti
834+
}
835+
}
836+
-1
837+
case _ =>
838+
// We had better not index into S directly!
839+
val iter = S.iterator.drop(m0)
840+
val Wopt = kmpOptimizeWord(W, n0, n1, true)
841+
val T = kmpJumpTable(Wopt, n1-n0)
842+
var cache = new Array[AnyRef](n1-n0) // Ring buffer--need a quick way to do a look-behind
843+
var largest = 0
844+
var i, m = 0
845+
var answer = -1
846+
while (m+m0+n1-n0 <= m1) {
847+
while (i+m >= largest) {
848+
cache(largest%(n1-n0)) = iter.next.asInstanceOf[AnyRef]
849+
largest += 1
850+
}
851+
if (Wopt(i) == cache((i+m)%(n1-n0))) {
852+
i += 1
853+
if (i == n1-n0) {
854+
if (forward) return m+m0
855+
else {
856+
i -= 1
857+
answer = m+m0
858+
val ti = T(i)
859+
m += i - ti
860+
if (i > 0) i = ti
861+
}
862+
}
863+
}
864+
else {
865+
val ti = T(i)
866+
m += i - ti
867+
if (i > 0) i = ti
868+
}
869+
}
870+
answer
871+
}
748872
}
749873

750874
/** Finds a particular index at which one sequence occurs in another sequence.
@@ -768,15 +892,27 @@ object SeqLike {
768892
def indexOf[B](
769893
source: Seq[B], sourceOffset: Int, sourceCount: Int,
770894
target: Seq[B], targetOffset: Int, targetCount: Int,
771-
fromIndex: Int): Int = {
772-
val toDrop = fromIndex max 0
773-
val src = source.slice(sourceOffset, sourceCount) drop toDrop
774-
val tgt = target.slice(targetOffset, targetCount)
775-
776-
KMP(src, tgt) match {
777-
case None => -1
778-
case Some(x) => x + toDrop
779-
}
895+
fromIndex: Int
896+
): Int = {
897+
// Fiddle with variables to match previous behavior and use kmpSearch
898+
// Doing LOTS of max/min, both clearer and faster to use math._
899+
val slen = source.length
900+
val clippedFrom = math.max(0, fromIndex)
901+
val s0 = math.min(slen, sourceOffset + clippedFrom)
902+
val s1 = math.min(slen, s0 + sourceCount)
903+
val tlen = target.length
904+
val t0 = math.min(tlen, targetOffset)
905+
val t1 = math.min(tlen, t0 + targetCount)
906+
907+
// Error checking
908+
if (clippedFrom > slen-sourceOffset) -1 // Cannot return an index in range
909+
else if (t1 - t0 < 1) s0 // Empty, matches first available position
910+
else if (s1 - s0 < t1 - t0) -1 // Source is too short to find target
911+
else {
912+
// Nontrivial search
913+
val ans = kmpSearch(source, s0, s1, target, t0, t1, true)
914+
if (ans < 0) ans else ans - math.min(slen, sourceOffset)
915+
}
780916
}
781917

782918
/** Finds a particular index at which one sequence occurs in another sequence.
@@ -787,18 +923,27 @@ object SeqLike {
787923
def lastIndexOf[B](
788924
source: Seq[B], sourceOffset: Int, sourceCount: Int,
789925
target: Seq[B], targetOffset: Int, targetCount: Int,
790-
fromIndex: Int): Int = {
791-
if (fromIndex < 0) return -1
792-
val toTake = (fromIndex + targetCount) min sourceCount
793-
// Given seq 1234567 looking for abc, we need to take an extra
794-
// abc.length chars to examine beyond what is dictated by fromIndex.
795-
val src = source.slice(sourceOffset, sourceCount) take toTake reverse
796-
val tgt = target.slice(targetOffset, targetCount).reverse
797-
798-
// then we reverse the adjustment here on success.
799-
KMP(src, tgt) match {
800-
case None => -1
801-
case Some(x) => src.length - x - targetCount
802-
}
926+
fromIndex: Int
927+
): Int = {
928+
// Fiddle with variables to match previous behavior and use kmpSearch
929+
// Doing LOTS of max/min, both clearer and faster to use math._
930+
val slen = source.length
931+
val tlen = target.length
932+
val s0 = math.min(slen, sourceOffset)
933+
val s1 = math.min(slen, s0 + sourceCount)
934+
val clippedFrom = math.min(s1 - s0, fromIndex)
935+
val t0 = math.min(tlen, targetOffset)
936+
val t1 = math.min(tlen, t0 + targetCount)
937+
val fixed_s1 = math.min(s1, s0 + clippedFrom + (t1 - t0) - 1)
938+
939+
// Error checking
940+
if (clippedFrom < 0) -1 // Cannot return an index in range
941+
else if (t1 - t0 < 1) s0+clippedFrom // Empty, matches last available position
942+
else if (fixed_s1 - s0 < t1 - t0) -1 // Source is too short to find target
943+
else {
944+
// Nontrivial search
945+
val ans = kmpSearch(source, s0, fixed_s1, target, t0, t1, false)
946+
if (ans < 0) ans else ans - s0
803947
}
948+
}
804949
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
6 6
2+
5 10
3+
-1 -1
4+
4 4
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
object Test {
2+
import scala.collection.SeqLike
3+
def slowSearch[A](xs: Seq[A], ys: Seq[A], start: Int = 0): Int = {
4+
if (xs startsWith ys) start
5+
else if (xs.isEmpty) -1
6+
else slowSearch(xs.tail, ys, start+1)
7+
}
8+
def bkwSlowSearch[A](xs: Seq[A], ys: Seq[A]) = {
9+
val i = slowSearch(xs.reverse, ys.reverse)
10+
if (i<0) i
11+
else xs.length - ys.length - i
12+
}
13+
def main(args: Array[String]) {
14+
val rng = new scala.util.Random(java.lang.Integer.parseInt("kmp",36))
15+
16+
// Make sure we agree with naive implementation
17+
for (h <- Array(2,5,1000)) {
18+
for (i <- 0 to 100) {
19+
for (j <- 0 to 10) {
20+
val xs = (0 to j).map(_ => (rng.nextInt & 0x7FFFFFFF) % h)
21+
val xsa = xs.toArray
22+
val xsv = Vector() ++ xs
23+
val xsl = xs.toList
24+
val xss = Vector[Seq[Int]](xs,xsa,xsv,xsl)
25+
for (k <- 0 to 5) {
26+
val ys = (0 to k).map(_ => (rng.nextInt & 0x7FFFFFFF) % h)
27+
val ysa = ys.toArray
28+
val ysv = Vector() ++ ys
29+
val ysl = ys.toList
30+
val yss = Vector[Seq[Int]](ys,ysa,ysv,ysl)
31+
val fwd_slow = slowSearch(xs,ys)
32+
val bkw_slow = bkwSlowSearch(xs,ys)
33+
val fwd_fast = xss.flatMap(xs => yss.map(ys => SeqLike.indexOf(xs,0,xs.length,ys,0,ys.length,0)))
34+
val bkw_fast = xss.flatMap(xs => yss.map(ys => SeqLike.lastIndexOf(xs,0,xs.length,ys,0,ys.length,xs.length)))
35+
assert(fwd_fast.forall(_ == fwd_slow))
36+
assert(bkw_fast.forall(_ == bkw_slow))
37+
}
38+
}
39+
}
40+
}
41+
42+
// Check correctness of common small test cases
43+
val haystacks = List[Seq[Int]](
44+
Array(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),
45+
Vector(99,2,99,99,2,99,99,99,2,99,99,99,99,2),
46+
List(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
47+
1 to 15
48+
)
49+
val needles = List[Seq[Int]](
50+
Array(7,8,9,10),
51+
Vector(99,99,99),
52+
List(1,1,1,1,1,2),
53+
5 to 9
54+
)
55+
(haystacks zip needles) foreach {
56+
case (hay, nee) =>
57+
println(hay.indexOfSlice(nee,2) + " " + hay.lastIndexOfSlice(nee,13))
58+
}
59+
}
60+
}

0 commit comments

Comments
 (0)