Kasai's Algorithm
Kasai's algorithm is used for constructing the longest common prefix (also referred to as LCP) array from a given suffix array and a text. Once we construct the LCP array, we can efficiently search for a pattern within the given text. We have discussed several algorithms that can solve pattern-matching problems efficiently including the KMP algorithm, the Boyer-Moore algorithm, and the Rabin-Karp algorithm. In this tutorial, we will explore Kasai's algorithm.
How Kasai's Algorithm works?
To understand Kasai' algorithm, we first need to learn the two core concepts of this algorithm −
Suffix Array − This is an array that stores the starting indices of all the suffixes present within a given text in lexicographic order.
LCP Array − As the name suggests, it is the longest common prefix (in short LCP) of two strings is the longest string that is a prefix of both strings.
The Kasai's algorithm is based on the following observation −
If the LCP of two suffixes starting at positions i and j is k, then the LCP of the suffixes starting at i+1 and j+1 is at least k-1, unless one of them is the last suffix in the suffix array. This is because the relative order of the characters in the suffixes remains the same after removing the first character unless they reach the end of the text. Therefore, we can use this property to compute the LCP values in a linear scan of the suffix array, starting from the first suffix and keeping track of the current LCP value in a variable k.
Whenever we move to the next suffix pair, we decrement k by one and then increment it as long as the characters at positions i+k and j+k match. To find the next suffix pair, we use an inverse array that maps each suffix index to its position in the suffix array.
Let's consider the input-output scenario for Kasai's algorithm −
Input: string: "AABAAABCEDBABCDDEBC" Output: Suffix Array: 0 1 9 3 8 2 14 10 4 11 5 15 7 12 13 6 Common Prefix Array: 1 2 3 0 4 1 2 2 0 1 1 1 1 0 1 0
Example
The following example practically demonstrates the Kasai's algorithm in different programming languages.
#include<stdio.h> #include<string.h> #include<stdlib.h> // Defining a structure to represent suffix struct suffixes { int index; int rank[2]; }; // function to compare two suffixes int compare(const void* a, const void* b) { struct suffixes* suf1 = (struct suffixes*)a; struct suffixes* suf2 = (struct suffixes*)b; // If first rank is same if(suf1->rank[0] == suf2->rank[0]) { // Compare second rank return (suf1->rank[1] < suf2->rank[1]) ? -1 : 1; }else { return (suf1->rank[0] < suf2->rank[0]) ? -1 : 1; } } // function to build suffix array int* createSuffArray(char* orgnlString, int n) { struct suffixes suffArray[n]; for (int i = 0; i < n; i++) { suffArray[i].index = i; // Rank based on character itself suffArray[i].rank[0] = orgnlString[i] - 'a'; // Next rank is next character suffArray[i].rank[1] = ((i+1)<n)?(orgnlString[i+1]-'a'):-1; } // Sorting the suffixes qsort(suffArray, n, sizeof(struct suffixes), compare); int index[n]; for (int k = 4; k < 2*n; k = k*2) { int currRank = 0; int prevRank = suffArray[0].rank[0]; suffArray[0].rank[0] = currRank; index[suffArray[0].index] = 0; // to assign rank and index values to first suffix for (int i = 1; i < n; i++) { if (suffArray[i].rank[0] == prevRank && suffArray[i].rank[1] == suffArray[i-1].rank[1]) { prevRank = suffArray[i].rank[0]; // If same as previous rank, assign the same new rank suffArray[i].rank[0] = currRank; } else{ prevRank = suffArray[i].rank[0]; // increment rank and assign suffArray[i].rank[0] = ++currRank; } index[suffArray[i].index] = i; } for (int i = 0; i < n; i++) { int nextIndex = suffArray[i].index + k/2; suffArray[i].rank[1] = (nextIndex < n)? suffArray[index[nextIndex]].rank[0]: -1; } qsort(suffArray, n, sizeof(struct suffixes), compare); } // to store indexes of all sorted suffixes int* suffixVector = (int*)malloc(n * sizeof(int)); for (int i = 0; i < n; i++) suffixVector[i] = suffArray[i].index; return suffixVector; } // applying Kasai's algorithm to build LCP array int* kasaiAlgorithm(char* orgnlString, int* suffixVector, int n) { // To store lcp array int* longPrefix = (int*)malloc(n * sizeof(int)); // To store inverse of suffix array elements int* suffixInverse = (int*)malloc(n * sizeof(int)); // to fill values in suffixInverse[] array for (int i=0; i < n; i++) suffixInverse[suffixVector[i]] = i; int k = 0; for (int i=0; i<n; i++) { if (suffixInverse[i] == n-1) { k = 0; continue; } int j = suffixVector[suffixInverse[i]+1]; while (i+k<n && j+k<n && orgnlString[i+k]==orgnlString[j+k]) k++; longPrefix[suffixInverse[i]] = k; if (k>0) k--; } return longPrefix; } void displayArray(int* vec, int n) { for (int i = 0; i < n; i++) printf("%d ", vec[i]); printf(" "); } int main() { char orgnlString[] = "AAABCAEAAABCBDDAAAABC"; int n = strlen(orgnlString); int* suffArray = createSuffArray(orgnlString, n); printf("Suffix Array is: "); displayArray(suffArray, n); // calling function to build LCP array int* prefixCommn = kasaiAlgorithm(orgnlString, suffArray, n); // Print the LCP array printf("Common Prefix Array is: "); displayArray(prefixCommn, n); return 0; }
#include<iostream> #include<vector> #include<algorithm> using namespace std; // Defining a structure to represent suffix struct suffixes { int index; int rank[2]; }; // function to compare two suffixes bool compare(suffixes suf1, suffixes suf2) { // If first rank is same if(suf1.rank[0] == suf2.rank[0]) { // Compare second rank if(suf1.rank[1] < suf2.rank[1]) return true; else return false; }else { if(suf1.rank[0] < suf2.rank[0]) return true; else return false; } } // function to build suffix array vector<int> createSuffArray(string orgnlString) { int n = orgnlString.size(); suffixes suffArray[n]; for (int i = 0; i < n; i++) { suffArray[i].index = i; // Rank based on character itself suffArray[i].rank[0] = orgnlString[i] - 'a'; // Next rank is next character suffArray[i].rank[1] = ((i+1)<n)?(orgnlString[i+1]-'a'):-1; } // Sorting the suffixes sort(suffArray, suffArray+n, compare); int index[n]; for (int k = 4; k < 2*n; k = k*2) { int currRank = 0; int prevRank = suffArray[0].rank[0]; suffArray[0].rank[0] = currRank; index[suffArray[0].index] = 0; // to assign rank and index values to first suffix for (int i = 1; i < n; i++) { if (suffArray[i].rank[0] == prevRank && suffArray[i].rank[1] == suffArray[i-1].rank[1]) { prevRank = suffArray[i].rank[0]; // If same as previous rank, assign the same new rank suffArray[i].rank[0] = currRank; } else{ prevRank = suffArray[i].rank[0]; // increment rank and assign suffArray[i].rank[0] = ++currRank; } index[suffArray[i].index] = i; } for (int i = 0; i < n; i++) { int nextIndex = suffArray[i].index + k/2; suffArray[i].rank[1] = (nextIndex < n)? suffArray[index[nextIndex]].rank[0]: -1; } sort(suffArray, suffArray+n, compare); } // to store indexes of all sorted suffixes vector<int>suffixVector; for (int i = 0; i < n; i++) suffixVector.push_back(suffArray[i].index); return suffixVector; } // applying Kasai's algorithm to build LCP array vector<int> kasaiAlgorithm(string orgnlString, vector<int> suffixVector) { int n = suffixVector.size(); // To store lcp array vector<int> longPrefix(n, 0); // To store inverse of suffix array elements vector<int> suffixInverse(n, 0); // to fill values in suffixInverse[] array for (int i=0; i < n; i++) suffixInverse[suffixVector[i]] = i; int k = 0; for (int i=0; i<n; i++) { if (suffixInverse[i] == n-1) { k = 0; continue; } int j = suffixVector[suffixInverse[i]+1]; while (i+k<n && j+k<n && orgnlString[i+k]==orgnlString[j+k]) k++; longPrefix[suffixInverse[i]] = k; if (k>0) k--; } return longPrefix; } void displayArray(vector<int> vec) { vector<int>::iterator it; for (it = vec.begin(); it < vec.end() ; it++) cout << *it << " "; cout << endl; } int main() { string orgnlString = "AAABCAEAAABCBDDAAAABC"; vector<int>suffArray = createSuffArray(orgnlString); int n = suffArray.size(); cout<< "Suffix Array is: "<<endl; displayArray(suffArray); // calling function to build LCP array vector<int>prefixCommn = kasaiAlgorithm(orgnlString, suffArray); // Print the LCP array cout<< "Common Prefix Array is: "<<endl; displayArray(prefixCommn); }
import java.util.Arrays; public class Main { // Defining a class to represent suffix static class suffixes { int index; int[] rank = new int[2]; } // method to compare two suffixes static int compare(suffixes suf1, suffixes suf2) { // If first rank is same if (suf1.rank[0] == suf2.rank[0]) { // Compare second rank if (suf1.rank[1] < suf2.rank[1]) return -1; else return 1; } else { if (suf1.rank[0] < suf2.rank[0]) return -1; else return 1; } } // method to build suffix array static int[] createSuffArray(String orgnlString) { int n = orgnlString.length(); suffixes[] suffArray = new suffixes[n]; for (int i = 0; i < n; i++) { suffArray[i] = new suffixes(); suffArray[i].index = i; // Rank based on character itself suffArray[i].rank[0] = orgnlString.charAt(i) - 'a'; // Next rank is next character suffArray[i].rank[1] = ((i + 1) < n) ? (orgnlString.charAt(i + 1) - 'a') : -1; } // Sorting the suffixes Arrays.sort(suffArray, Main::compare); int[] index = new int[n]; for (int k = 4; k < 2 * n; k = k * 2) { int currRank = 0; int prevRank = suffArray[0].rank[0]; suffArray[0].rank[0] = currRank; index[suffArray[0].index] = 0; // to assign rank and index values to first suffix for (int i = 1; i < n; i++) { if (suffArray[i].rank[0] == prevRank && suffArray[i].rank[1] == suffArray[i - 1].rank[1]) { prevRank = suffArray[i].rank[0]; // If same as previous rank, assign the same new rank suffArray[i].rank[0] = currRank; } else { prevRank = suffArray[i].rank[0]; // increment rank and assign suffArray[i].rank[0] = ++currRank; } index[suffArray[i].index] = i; } for (int i = 0; i < n; i++) { int nextIndex = suffArray[i].index + k / 2; suffArray[i].rank[1] = (nextIndex < n) ? suffArray[index[nextIndex]].rank[0] : -1; } Arrays.sort(suffArray, Main::compare); } // to store indexes of all sorted suffixes int[] suffixVector = new int[n]; for (int i = 0; i < n; i++) suffixVector[i] = suffArray[i].index; return suffixVector; } // applying Kasai's algorithm to build LCP array static int[] kasaiAlgorithm(String orgnlString, int[] suffixVector) { int n = suffixVector.length; // To store lcp array int[] longPrefix = new int[n]; // To store inverse of suffix array elements int[] suffixInverse = new int[n]; // to fill values in suffixInverse[] array for (int i = 0; i < n; i++) suffixInverse[suffixVector[i]] = i; int k = 0; for (int i = 0; i < n; i++) { if (suffixInverse[i] == n - 1) { k = 0; continue; } int j = suffixVector[suffixInverse[i] + 1]; while (i + k < n && j + k < n && orgnlString.charAt(i + k) == orgnlString.charAt(j + k)) k++; longPrefix[suffixInverse[i]] = k; if (k > 0) k--; } return longPrefix; } static void displayArray(int[] vec) { for (int i : vec) System.out.print(i + " "); System.out.println(); } public static void main(String[] args) { String orgnlString = "AAABCAEAAABCBDDAAAABC"; int[] suffArray = createSuffArray(orgnlString); System.out.println("Suffix Array is: "); displayArray(suffArray); // calling method to build LCP array int[] prefixCommn = kasaiAlgorithm(orgnlString, suffArray); // Print the LCP array System.out.println("Common Prefix Array is: "); displayArray(prefixCommn); } }
# Defining a class to represent suffix class Suffix: def __init__(self): self.index = 0 self.rank = [0, 0] # function to compare two suffixes def compare(a, b): if a.rank[0] == b.rank[0]: if a.rank[1] < b.rank[1]: return -1 else: return 1 else: if a.rank[0] < b.rank[0]: return -1 else: return 1 # function to build suffix array def createSuffArray(orgnlString): n = len(orgnlString) suffArray = [Suffix() for _ in range(n)] for i in range(n): suffArray[i].index = i suffArray[i].rank[0] = ord(orgnlString[i]) - ord('a') suffArray[i].rank[1] = ord(orgnlString[i + 1]) - ord('a') if ((i + 1) < n) else -1 suffArray = sorted(suffArray, key=lambda x: (x.rank[0], x.rank[1])) ind = [0]*n k = 4 while k < 2*n: rank = 0 prev_rank = suffArray[0].rank[0] suffArray[0].rank[0] = rank ind[suffArray[0].index] = 0 for i in range(1, n): if suffArray[i].rank[0] == prev_rank and suffArray[i].rank[1] == suffArray[i - 1].rank[1]: prev_rank = suffArray[i].rank[0] suffArray[i].rank[0] = rank else: prev_rank = suffArray[i].rank[0] rank += 1 suffArray[i].rank[0] = rank ind[suffArray[i].index] = i for i in range(n): nextIndex = suffArray[i].index + k//2 suffArray[i].rank[1] = suffArray[ind[nextIndex]].rank[0] if (nextIndex < n) else -1 suffArray = sorted(suffArray, key=lambda x: (x.rank[0], x.rank[1])) k *= 2 suffixVector = [0]*n for i in range(n): suffixVector[i] = suffArray[i].index return suffixVector # applying Kasai's algorithm to build LCP array def kasaiAlgorithm(orgnlString, suffixVector): n = len(suffixVector) longPrefix = [0]*n suffixInverse = [0]*n for i in range(n): suffixInverse[suffixVector[i]] = i k = 0 for i in range(n): if suffixInverse[i] == n - 1: k = 0 continue j = suffixVector[suffixInverse[i] + 1] while i + k < n and j + k < n and orgnlString[i + k] == orgnlString[j + k]: k += 1 longPrefix[suffixInverse[i]] = k if k > 0: k -= 1 return longPrefix # Function to print an array def displayArray(vec): for i in vec: print(i, end=' ') print() def main(): orgnlString = "AAABCAEAAABCBDDAAAABC" suffArray = createSuffArray(orgnlString) print("Suffix Array is: ") displayArray(suffArray) prefixCommn = kasaiAlgorithm(orgnlString, suffArray) print("Common Prefix Array is: ") displayArray(prefixCommn) if __name__ == "__main__": main()
Output
Suffix Array is: 15 0 7 16 17 1 8 2 9 18 5 19 3 10 12 4 11 20 14 13 6 Common Prefix Array is: 3 5 5 2 4 4 4 3 3 3 0 2 2 2 0 1 1 1 1 0 0