From def62f767ace41c4c1daa71e1a309761d137e04f Mon Sep 17 00:00:00 2001 From: Aakash Panchal <51417248+Aakash-Panchal27@users.noreply.github.com> Date: Thu, 4 Jun 2020 02:22:16 +0530 Subject: [PATCH] Update Z-algorithm.md --- articles/Akash Articles/md/Z-algorithm.md | 258 +++++++++++----------- 1 file changed, 133 insertions(+), 125 deletions(-) diff --git a/articles/Akash Articles/md/Z-algorithm.md b/articles/Akash Articles/md/Z-algorithm.md index 9c87c55..cd7535a 100644 --- a/articles/Akash Articles/md/Z-algorithm.md +++ b/articles/Akash Articles/md/Z-algorithm.md @@ -1,35 +1,39 @@ Z-function and z-algorithm +Z-algorithm is a **string-matching algorithm**, which is used to find a place where a string is found within a larger string. It uses the value of **z-function** for a given string. + +Let's first see what is a **z-function**. + # Z-Algorithm -Z-function for a given string $s$ of length $n$ is an array of length $n$, where $z[i]$ represents length of longest common prefix of string $s$ and suffix of s starting at $i$ i.e. $s[i,n-1]$. +Z-function for a given string $s$ of length $n$ returns an array $z$ of length $n$, where $z[i]$ represents the length of the longest common prefix of string $s$(i.e. $s[0,n-1]$) and suffix of $s$ starting at $i$ i.e. $s[i,n-1]$. **Note:** $s[l,r]$ represents substring of $S$ starting at index $l$ and ending at index $r$. Here, we are taking zero based indices. -Note that value of $z[0]$ is not properly defined so we take it as zero($0$). +Note that the value of $z[0]$ is not properly defined so we take it as zero($0$). For example, 1. $z("cccc") = [0,3,2,1]$ - Why $z[1]=3$? - Because $s[0,2] = s[1,3] = "ccc"$. + Why $z[1]=3$? + Because $s[0,2] = s[1,3] = "ccc"$. 2. $z("ababab")=[0,0,4,0,2,0]$ - Why z[2] = 4? - Because $s[0,3] = s[2,5] = "abab"$. + Why z[2] = 4? + Because $s[0,3] = s[2,5] = "abab"$. 3. $z("abacaba") = [0,0,1,0,3,0,1]$ - Why z[4] = 3? - Because $s[0,2] = s[4,6] = "aba"$. + Why z[4] = 3? 
+ Because $s[0,2] = s[4,6] = "aba"$. -Can you figure out how do we find value of z-function? +Can you figure out how we find the value of z-function? ## Trivial Algorithm -Basic way to find value of z-function is to do brute force. For index - $i$, we find it following way. +The basic way to find the value of z-function is to do brute force. For index - $i$, we find it following way. ``` z[i] = 0; while(i + z[i] < n && s[z[i]] == s[i + z[i]]) - z[i]++; + z[i]++; ``` -Simply, do this for every indices. +Simply, do this for every index. ```cpp vector z_function(string s) { @@ -60,6 +64,8 @@ We can see that $s[i,r]$ and $s[i-l,r-l]$ are equal. Now, look at $z[i-l]$ and t $z[i-l]$ tells us that $s[0,z[i-l]-1]$ and $s[i-l,i-l+z[i-l]-1]$ are equal and therefore $s[0,z[i-l]-1]$ and $s[i,i+z[i-l]-1]$ are equal, which means that $z[i]=z[i-l]$. +Confused? Go through the series of images below that will make the whole thing clear. + ![enter image description here](https://github.com/KingsGambitLab/Lecture_Notes/blob/master/articles/Akash%20Articles/md/Images/Z-algorithm/3.jpg) ![enter image description here](https://github.com/KingsGambitLab/Lecture_Notes/blob/master/articles/Akash%20Articles/md/Images/Z-algorithm/4.jpg) @@ -76,14 +82,14 @@ Now, we will run brute force algorithm: // As per the discussion z[i] = min(z[i-l],r-i+1); while(i + z[i] < n && s[z[i]] == s[i + z[i]]) - z[i]++; + z[i]++; ``` -After that if $i+z[i]$ is going beyond $r$, then we simply update indices $[l,r]$ to maintain **rightmost segment match** to take advantage of previous values as much as possible for next indices as well. +After that, if $i+z[i]-1$ goes beyond $r$, then we simply update indices $[l,r]$ as $l = i$ and $r = i + z[i] - 1$, to maintain the **rightmost segment match** and take advantage of previous values as much as possible for the next indices as well. -**Note that initially $[l,r]$ segment is taken as $[0,0]$**. 
So, we basically start by doing brute force, or generally for an index $i$, +**Note that initially $[l,r]$ segment is taken as $[0,0]$**. So, we start by doing brute force, or generally for an index $i$, -1. If $i<=r$, then we wiil take advantage of previous value and then do brute force. +1. If $i<=r$, then we will take advantage of the previous value and then do brute force. 2. Else if $i>r$, we directly do brute force as we can't take advantage of any previous value. ```cpp @@ -92,7 +98,7 @@ vector z_function(string s) { vector z(n); int l = 0, r = 0; for (int i = 1; i < n; ++i) { - // Take advantage of previous value + // Take advantage of previous value if (i <= r) z[i] = min (r - i + 1, z[i - l]); @@ -102,8 +108,8 @@ vector z_function(string s) { // Set new range [l,r] if (i + z[i] - 1 > r) { - l = i; - r = i + z[i] - 1; + l = i; + r = i + z[i] - 1; } } return z; @@ -112,7 +118,7 @@ vector z_function(string s) { ### Time complexity -$O(N)$, as at each step of the algorithm $r$ at least increases one step and maximum possible value of r is $n-1$. +$O(N)$, as every successful comparison in the `while` loop increases $r$ by one step (each index adds at most one failed comparison), and the maximum possible value of $r$ is $n-1$. ## Search for a string @@ -122,7 +128,7 @@ For example, `p = "ab"` and `s = "abbbabab"`, then Z-algorithm will find us `[0, Basic idea here is to create a new string having $p$ as a prefix and $s$ as a suffix i.e. `new_str = p + '#' + s`. -**To make sure that the value of Z-function does not exceed length of $p$, we will add an additional character which is never going to appear in string $s$**. +**To make sure that the value of Z-function does not exceed the length of $p$, we will add a separator character which is never going to appear in string $s$**. Now, we will find Z-function of `new_str`. 
@@ -135,48 +141,48 @@ And therefore **all indices-$i$ where the values of Z-function $Z[i]$ equals to ```cpp int main() { - string s,p; - s = "abbbabab"; - p = "ab"; - int n = s.size(), m = p.size(); + string s,p; + s = "abbbabab"; + p = "ab"; + int n = s.size(), m = p.size(); - // To save memory concatenate - // s in p - p += "#"; - p += s; - // p = "ab#abbbabab"; - vector z = z_function(p); + // To save memory concatenate + // s in p + p += "#"; + p += s; + // p = "ab#abbbabab"; + vector z = z_function(p); - // p = "ab#abbbabab"; - // ^ - // m+1 - cout << "occurences in s at the following indices: "; - for(int i = m + 1; i < z.size(); i++) { - if(z[i] == m) { - cout << i - m - 1 << " "; - } - } - - return 0; + // p = "ab#abbbabab"; + // ^ + // m+1 + cout << "occurences in s at the following indices: "; + for(int i = m + 1; i < z.size(); i++) { + if(z[i] == m) { + cout << i - m - 1 << " "; + } + } + + return 0; } ``` -## To find period of string +## To find the period of a string -Period of string is the shortest length such that a larger string $s$ can be represented as a concatenation of one or more copies of a substring($t$). +Period of a string is the shortest length such that a larger string $s$ can be represented as a concatenation of one or more copies of a substring($t$). For example, `s = "ababab"` has a period of $2$, where `t = "ab"`. -Let's see how to find period of $s$ using value of z-function of $s$. +Let's see how to find the period of $s$ using the value of z-function of $s$. -**First of all note that length of string $s$($n$) is divisible by period of string.** Therefore, we can divide string $s$ into multiple blocks of same length as period of $s$. +**First of all note that the length of string $s$($n$) is divisible by the period of string.** Therefore, we can divide string $s$ into multiple blocks of the same length as a period of $s$. First of all, we will find all divisors of $n$ and value of z-function of $s$. 
Now, we will need to find smallest divisor of $n$ for which $i+z[i] = n$, which is period of string $s$. Why? $z[i]$ represents length of the longest common prefix of $s[0,n-1]$ and $s[i,n-1]$. As $i$ is divisor of $n$, we can divide the whole string into blocks of length $i$. -From the value of $z[i] = n-i$($\because i+z[i]=n$), we can see that the first block($s[0,i-1]$) is equal to the second block starting at $i$-$s[i,i+i-1]$, which is also equal to third block $s[2*i,3*i-1]$ and similarly all blocks turns out to be equal. +From the value of $z[i] = n-i$($\because i+z[i]=n$), we can see that the first block($s[0,i-1]$) is equal to the second block starting at $i$ i.e. $s[i,i+i-1]$, which is also equal to third block $s[2*i,3*i-1]$ and similarly all blocks turns out to be equal. Therefore, smallest $i$ such that $n\% i=0$ and $i+z[i]=n$, is period of string $s$. If there is no such $i$, then string is not periodic as we cannot divide string into equivalent blocks. @@ -187,36 +193,36 @@ vector getDivisors(int n) for (int i=1; i<=sqrt(n); i++) if (n%i==0) { - v.push_back(i); + v.push_back(i); if (n != i*i) - v.push_back(n/i); + v.push_back(n/i); } return v; } int main() { - string s,p; - s = "abcabcabc"; - int n = (int) s.size(); - vector divs = getDivisors(n); - sort(divs.begin(),divs.end()); - - vector z = z_function(s); - int period = 0; - for(auto i:divs) { - if(i < n && z[i] + i == n) { - period = i; - break; - } - } - - if(period) - cout << period << endl; - else - cout << "String is not periodic" << endl; - - return 0; + string s,p; + s = "abcabcabc"; + int n = (int) s.size(); + vector divs = getDivisors(n); + sort(divs.begin(),divs.end()); + + vector z = z_function(s); + int period = 0; + for(auto i:divs) { + if(i < n && z[i] + i == n) { + period = i; + break; + } + } + + if(period) + cout << period << endl; + else + cout << "String is not periodic" << endl; + + return 0; } ``` @@ -225,55 +231,54 @@ int main() Now, we know how to find a period of a string 
and therefore we can compress string as only one block of size $i$ which repeats all over again and again in $s$. -To retrive the string back from compressed version, we can attatch its real length i.e. length of $s$. +To retrieve the string back from the compressed version, we can attach its real length i.e. length of $s$. ```cpp int main() { - string s,p; - s = "abcabcabc"; - int n = (int) s.size(); - vector divs = getDivisors(n); - sort(divs.begin(),divs.end()); - - vector z = z_function(s); - int period = 0; - for(auto i:divs) { - if(i < n && z[i] + i == n) { - period = i; - break; - } - } - - if(period != 0) { - // A way to represent compressed string - // Attatch real length of string to retrieve easily - pair compressed_str{s.substr(0,period), n}; - } - else { - cout << "can't be compressed by this method" << endl; - } - - return 0; + string s,p; + s = "abcabcabc"; + int n = (int) s.size(); + vector divs = getDivisors(n); + sort(divs.begin(),divs.end()); + + vector z = z_function(s); + int period = 0; + for(auto i:divs) { + if(i < n && z[i] + i == n) { + period = i; + break; + } + } + + if(period != 0) { + // A way to represent a compressed string + // Attatch real length of string to retrieve easily + pair compressed_str{s.substr(0,period), n}; + } + else { + cout << "can't be compressed by this method" << endl; + } + + return 0; } ``` +## Number of distinct substrings in a string -## Number of distinct substring in a string +**Problem statement:** Find the number of unique substrings in a given string $s$. -**Problem statement:** Find number of unique substrings in a given string $s$. +**Brief idea:** Basic idea here is to take an empty string $t$ and add characters one by one from string $s$ and along with that check how many new substrings are created, due to the addition of a character in $t$, using z-function. 
-**Brief idea:** Basic idea here is to take an empty string $t$ and add characters one by one from string $s$ and along with that check how many new substrings are created, due to addition of a character in $t$, using z-function. - -Let say we have already added some characters to $t$ from $s$ and $k$ is the number of distinct substrings currently. Now, we are a adding character $c$ to $t$, $t = t+c$. +Let's say we have already added some characters to $t$ from $s$ and $k$ is the number of distinct substrings currently. Now, we are adding a character $c$ to $t$, $t = t+c$. Note that total number of new substrings created by appending a character to any string($t$) is equal to the length of new string($t=t+c$) created. **For example, Appending `'d'` in `"abc"` creates 4 new substrings: `"d"`, `"cd"`, `"bcd"`, `"abcd"`.** -But how to find number of new unique substrings created by addition of $c$ **using z-function**? +But how to find the number of new unique substrings created by the addition of $c$ **using z-function**? **Hint:** Reverse $t$. -By reversing $t$, our task burn down into computing how many prefixes there are that don't appear anywhere else in $t$, which can be done by finding z-function of $t$. +By reversing $t$, our task boils down to computing how many prefixes there are that don't appear anywhere else in $t$, which can be done by finding the z-function of the reversed $t$. After finding value of z-function, we will find maximum value $z_{max}$($z_{max} = max\{z[i]\}, \forall i$) in the z-function of reversed $t$, which shows the length of longest prefix which is already in $t$ as a substring and it also implies that all smaller prefixes are already present as substrings in $t$. @@ -281,7 +286,7 @@ Therefore, we will deduct this number of already present substrings i.e. $z_{max Where $|t|$ is the length of $t$. -Finally, number of new unique substrings created by addition of a character turns out to be $|t|-z_{max}$. 
+Finally, the number of new unique substrings created by the addition of a character turns out to be $|t|-z_{max}$. **Note that $|t|$ is the length of $t$ after adding a character.** @@ -293,7 +298,7 @@ int z_function(string& s) { int l = 0, r = 0; int mx = 0; for (int i = 1; i < n; ++i) { - // Take advantage of previous value + // Take advantage of previous value if (i <= r) z[i] = min (r - i + 1, z[i - l]); @@ -305,8 +310,8 @@ int z_function(string& s) { // Set new range [l,r] if (i + z[i] - 1 > r) { - l = i; - r = i + z[i] - 1; + l = i; + r = i + z[i] - 1; } } return mx; @@ -314,24 +319,27 @@ int z_function(string& s) { int main() { - string s,p; - s = "abc"; - int n = s.size(); - - string t, temp; - int unique_substr = 0; - - for(int i=0; i < n; i++) { - t += s[i]; - temp = t; - reverse(temp.begin(), temp.end()); - // |t| - mx - unique_substr += (int)t.size() - z_function(temp); - } + string s,p; + s = "abc"; + int n = s.size(); + + string t, temp; + int unique_substr = 0; + + for(int i=0; i < n; i++) { + t += s[i]; + temp = t; + reverse(temp.begin(), temp.end()); + // |t| - mx + unique_substr += (int)t.size() - z_function(temp); + } - // Total number of unique substrings - cout << unique_substr << endl; - - return 0; + // Total number of unique substrings + cout << unique_substr << endl; + + return 0; } ``` +**Complexity**: $O(N^2)$, where $N$ is the length of $s$. + +For each character appended, we are computing z-function in $O(N)$, which gives a time complexity of $O(N^2)$ in total.