MySQL: Compare differences between two tables - sql

Same as oracle diff: how to compare two tables? except in mysql.
Suppose I have two tables, t1 and t2 which are identical in layout but which may contain different data.
What's the best way to diff these two tables?
To be more precise, I'm trying to figure out a simple SQL query that tells me if data from one row in t1 is different from the data from the corresponding row in t2
It appears I cannot use the intersect nor minus. When I try
SELECT * FROM robot intersect SELECT * FROM tbd_robot
I get an error code:
[Error Code: 1064, SQL State: 42000] You have an error in your SQL
syntax; check the manual that corresponds to your MySQL server version
for the right syntax to use near 'SELECT * FROM tbd_robot' at line 1
Am I doing something syntactically wrong? If not, is there another query I can use?
Edit: Also, I'm querying through a free version DbVisualizer. Not sure if that might be a factor.

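MySQL (at least the version used in the question) has no INTERSECT or MINUS, so the comparison has to be emulated. The query below returns the rows that appear in only one of the two tables: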
-- Rows that exist in only one of the two tables, tagged with the table they
-- came from. NOT EXISTS with the NULL-safe <=> operator replaces
-- ROW(...) NOT IN (...): with NOT IN the comparison can evaluate to NULL
-- instead of TRUE as soon as a compared column holds NULL, so such rows
-- would silently go unreported.
-- Add one `<=>` predicate per column where the trailing comments indicate.
SELECT 'robot' AS `set`, r.*
FROM robot r
WHERE NOT EXISTS
(
    SELECT 1
    FROM tbd_robot t
    WHERE t.col1 <=> r.col1
      AND t.col2 <=> r.col2 -- AND t.colN <=> r.colN for every remaining column
)
UNION ALL
SELECT 'tbd_robot' AS `set`, t.*
FROM tbd_robot t
WHERE NOT EXISTS
(
    SELECT 1
    FROM robot r
    WHERE r.col1 <=> t.col1
      AND r.col2 <=> t.col2 -- AND r.colN <=> t.colN for every remaining column
)

You can construct the intersection manually using UNION. It's easy if you have some unique field in both tables, e.g. ID:
-- Rows whose ID occurs in only one of the two tables (ID is assumed unique).
-- NOT EXISTS replaces NOT IN: a single NULL ID in the subquery would make
-- NOT IN return no rows at all. UNION ALL is sufficient because the two
-- halves can never share an ID, so there is nothing to de-duplicate.
SELECT * FROM T1
WHERE NOT EXISTS (SELECT 1 FROM T2 WHERE T2.ID = T1.ID)
UNION ALL
SELECT * FROM T2
WHERE NOT EXISTS (SELECT 1 FROM T1 WHERE T1.ID = T2.ID)
If you don't have a unique value, you can still expand the above code to check for all fields instead of just the ID, and use AND to connect them (e.g. ID NOT IN(...) AND OTHER_FIELD NOT IN(...) etc)

I found another solution in this link
-- Stack both tables, then keep every (PK, column_list) combination that
-- occurs exactly once: such a row is either missing from the other table or
-- has different column values there. MIN(tbl_name) reports which table the
-- surviving row came from.
-- Note: there must be no space between MIN and "(" -- under the default
-- sql_mode MySQL only recognises a built-in function when the parenthesis
-- follows the name immediately.
SELECT MIN(tbl_name) AS tbl_name, PK, column_list
FROM
(
    SELECT 'source_table' AS tbl_name, S.PK, S.column_list
    FROM source_table AS S
    UNION ALL
    SELECT 'destination_table' AS tbl_name, D.PK, D.column_list
    FROM destination_table AS D
) AS alias_table
GROUP BY PK, column_list
HAVING COUNT(*) = 1
ORDER BY PK

-- Pair every t1 row with the t2 row that agrees on all four columns.
-- t1 rows without such a partner are still returned, with t2.user_id NULL.
SELECT
    t1.user_id,
    t2.user_id
FROM t1
LEFT JOIN t2
    ON  t2.user_id    = t1.user_id
    AND t2.username   = t1.username
    AND t2.first_name = t1.first_name
    AND t2.last_name  = t1.last_name
Try this. It pairs every row of t1 with the matching row of t2; when no row in t2 matches on all four columns, the t2.user_id column (the right-hand side of the join) is returned as NULL.

Based on Haim's answer I created a PHP code to test and display all the differences between two databases.
This will also display if a table is present in source or test databases.
You have to change with your details the <> variables content.
<?php
// Compares the tables of a source database against a test database on the
// same MySQL server and prints an HTML report: one row per table name, two
// checkboxes showing in which database the table exists, and the output of
// compareTables() for that table.
// Replace the <...> placeholders below with your own details.
$User = "<DatabaseUser>";
$Pass = "<DatabasePassword>";
$SourceDB = "<SourceDatabase>";
$TestDB = "<DatabaseToTest>";
// No default schema is selected; both databases are inspected through
// INFORMATION_SCHEMA. The "p:" host prefix asks mysqli for a persistent connection.
$link = new mysqli( "p:". "localhost", $User, $Pass, "" );
if ( mysqli_connect_error() ) {
    die('Connect Error ('. mysqli_connect_errno() .') '. mysqli_connect_error());
}
mysqli_set_charset( $link, "utf8" );
mb_language( "uni" );
mb_internal_encoding( "UTF-8" );

// Table names of the source database.
$sQuery = 'SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA="'. $SourceDB .'";';
$SourceDB_Content = query( $link, $sQuery );
if ( !is_array( $SourceDB_Content) ) {
    echo "Table $SourceDB cannot be accessed";
    exit(0);
}
// Table names of the database under test.
$sQuery = 'SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA="'. $TestDB .'";';
$TestDB_Content = query( $link, $sQuery );
if ( !is_array( $TestDB_Content) ) {
    echo "Table $TestDB cannot be accessed";
    exit(0);
}

// Flatten the result rows into plain lists of table names.
$SourceDB_Tables = array();
foreach( $SourceDB_Content as $item ) {
    $SourceDB_Tables[] = $item["TABLE_NAME"];
}
$TestDB_Tables = array();
foreach( $TestDB_Content as $item ) {
    $TestDB_Tables[] = $item["TABLE_NAME"];
}

// Every table name found in either database, listed once. Without
// array_unique() a table present in both databases would appear twice in the
// report and its discrepancies would be counted twice.
$LookupTables = array_unique( array_merge( $SourceDB_Tables, $TestDB_Tables ) );
$NoOfDiscrepancies = 0; // incremented by compareTables() through `global`
echo "
<table border='1' width='100%'>
<tr>
<td>Table</td>
<td>Found in $SourceDB (". count( $SourceDB_Tables ) .")</td>
<td>Found in $TestDB (". count( $TestDB_Tables ) .")</td>
<td>Test result</td>
</tr>
";
foreach( $LookupTables as $table ) {
    $FoundInSourceDB = in_array( $table, $SourceDB_Tables ) ? 1 : 0;
    $FoundInTestDB = in_array( $table, $TestDB_Tables ) ? 1 : 0;
    echo "
<tr>
<td>$table</td>
<td><input type='checkbox' ". ($FoundInSourceDB == 1 ? "checked" : "") ."></td>
<td><input type='checkbox' ". ($FoundInTestDB == 1 ? "checked" : "") ."></td>
<td>". compareTables( $SourceDB, $TestDB, $table ) ."</td>
</tr>
";
}
echo "
</table>
<br><br>
No of discrepancies found: $NoOfDiscrepancies
";
/**
 * Runs one SQL statement on the given mysqli link.
 *
 * If the server reports an error, the error text is printed and the script
 * exits. Otherwise returns false or true when mysqli_query() returned a
 * boolean, or the result rows as a list of associative arrays.
 *
 * @param mysqli $link open connection
 * @param string $q    SQL text to execute
 * @return array|bool
 */
function query( $link, $q ) {
    $result = mysqli_query( $link, $q );
    $errors = mysqli_error( $link );
    if ( $errors != "" ) {
        echo $errors;
        exit(0);
    }
    if ( !$result ) {
        return false;
    }
    if ( $result === true ) {
        return true;
    }
    // A result set was returned: collect every row.
    $rows = array();
    while ( $record = mysqli_fetch_assoc( $result ) ) {
        $rows[] = $record;
    }
    return $rows;
}
/**
 * Compares the column definitions of one table between two schemas.
 *
 * The query groups the information_schema.columns rows of both schemas by
 * (column_name, ordinal_position, data_type, column_type) and keeps the
 * groups that occur only once, i.e. definitions found in just one schema.
 * Uses the global $link connection and increments the global
 * $NoOfDiscrepancies once when at least one such row exists.
 *
 * @param string $source source schema name
 * @param string $test   schema name to test
 * @param string $table  table name to compare
 * @return string HTML table of the differing columns, or a "no discrepancies" message
 */
function compareTables( $source, $test, $table ) {
    global $link;
    global $NoOfDiscrepancies;
    $sQuery = "
SELECT column_name,ordinal_position,data_type,column_type FROM
(
SELECT
column_name,ordinal_position,
data_type,column_type,COUNT(1) rowcount
FROM information_schema.columns
WHERE
(
(table_schema='$source' AND table_name='$table') OR
(table_schema='$test' AND table_name='$table')
)
AND table_name IN ('$table')
GROUP BY
column_name,ordinal_position,
data_type,column_type
HAVING COUNT(1)=1
) A;
";
    $mismatches = query( $link, $sQuery );
    // Nothing differing (or no result set at all): report a clean check.
    if ( !is_array( $mismatches ) || count( $mismatches ) == 0 ) {
        return "Checked but no discrepancies found!";
    }
    $NoOfDiscrepancies++;
    $html = "<table><tr><td>column_name</td><td>ordinal_position</td><td>data_type</td><td>column_type</td></tr>";
    foreach ( $mismatches as $col ) {
        $cells = array( $col["column_name"], $col["ordinal_position"], $col["data_type"], $col["column_type"] );
        $html .= "<tr><td>". implode( "</td><td>", $cells ) ."</td></tr>";
    }
    return $html ."</table>";
}
?>

The approach below compares a table before and after running a big update.
If you use Linux, you can use the following commands
in a terminal:
# 1. Dump the table before running the update.
mysqldump -hlocalhost -uroot -p schema_name_here table_name_here > /home/ubuntu/database_dumps/dump_table_before_running_update.sql
# 2. Run the update, then dump the same table again.
mysqldump -hlocalhost -uroot -p schema_name_here table_name_here > /home/ubuntu/database_dumps/dump_table_after_running_update.sql
# 3. Unified diff of the two dumps. The first file name must match the dump
#    written in step 2. Lines prefixed "-" come from the "after" dump,
#    lines prefixed "+" from the "before" dump.
diff -uP /home/ubuntu/database_dumps/dump_table_after_running_update.sql /home/ubuntu/database_dumps/dump_table_before_running_update.sql > /home/ubuntu/database_dumps/diff.txt
You will need online tools for
Formatting SQL exported from the dumps,
e.g http://www.dpriver.com/pp/sqlformat.htm [Not the best I've seen]
diff.txt now holds the differences. Manually pick out the lines marked - and +; each one is a single-line INSERT statement that contains the row values.
Then paste those two lines (the - line and the + line) into an online diff tool to compare them,
e.g https://www.diffchecker.com [you can save and share it, and has no limit on file size!]
Note: be extra careful if its sensitive/production data!

You can try the big data comparison platform at https://github.com/zhugezifang/dataCompare
Here is an introduction to it:
Design and practice of open source big data comparison platform
1. Background & current situation
In the process of developing large numbers, it is often encountered that data migration or upgrade, or different business parties have processed data according to their needs, but think that the data on both sides is still the same, so it will be necessary to manually compare the data. So is the data on both sides consistent? If not, what are the differences?
If there is no platform, you need to manually write some SQL scripts for comparison, and there is no evaluation standard. This is inefficient.
"Alibaba's Road to Big Data" actually mentions such a platform, but because it is not used externally, the introduction in the book is relatively simple. Based on previous work experience, a big data comparison platform was developed to assist in verifying data, named dataCompare.
Main solutions:
(1) Verify data and data comparison, which wastes great labor costs
(2) Without a set of standards, the results of verification are difficult to evaluate
(3) Automatic data verification and comparison can be achieved by interface interaction, check or low-code
[enter image description here][1]
2. Purpose
(1) Automatic data verification and comparison can be achieved by interface interaction, check or low-code.
(2) The data team's data comparison efficiency is increased by at least about 50%.
(3) A set of unified data verification scheme to meet the standard specifications of data verification and comparison
3. System architecture design
4. The current version has implemented the following functions
(1) Low-code simple configuration completes the core function of data comparison
(2) Data magnitude comparison and data consistency comparison
5. Follow-up Development Plan
(1) Discrepancy case finding
(2) Data pointer detection---- enumeration value detection, range detection, numerical detection, primary key mode detection
(3) Data comparison task is scheduled and automatically scheduled
(4) Automatically send an email report to the comparison results
6. The core code is open source on GitHub
https://github.com/zhugezifang/dataCompare
[enter image description here][1]

Based on Haim's answer here's a simplified example if you're looking to compare values that exist in BOTH tables, otherwise if there's a row in one table but not the other it will also return it....
Took me a couple of hours to figure out. Here's a fully tested simply query for comparing "tbl_a" and "tbl_b"
-- IDs that exist in BOTH tables but whose `col` value differs between them.
-- Stacking both tables and keeping (ID, col) pairs that occur exactly once
-- finds the differing rows; the two IN filters drop IDs that are missing
-- from either table.
SELECT t.ID, t.col
FROM
(
    SELECT tbl_a.ID, tbl_a.col FROM tbl_a
    UNION ALL
    SELECT tbl_b.ID, tbl_b.col FROM tbl_b
) t
WHERE t.ID IN (SELECT ID FROM tbl_a)
  AND t.ID IN (SELECT ID FROM tbl_b)
GROUP BY t.ID, t.col
HAVING COUNT(*) = 1
ORDER BY t.ID
So you need to add the extra "where in" clause:
WHERE ID IN (select ID from tbl_a) AND ID IN (select ID from tbl_b)
Also:
For ease of reading if you want to indicate the table names you can use the following:
-- Same comparison as the unlabelled query, with a label showing which table
-- each differing row came from.
-- `tbl` is not part of the GROUP BY, so it must be aggregated: selecting it
-- bare is rejected when ONLY_FULL_GROUP_BY is enabled. Because every
-- surviving group holds exactly one row (HAVING COUNT(*) = 1), MIN(tbl)
-- simply returns that row's label.
-- The labels are written as single-quoted string literals.
SELECT MIN(tbl) AS tbl, ID, col
FROM
(
    SELECT tbl_a.ID, tbl_a.col, 'name_to_display1' AS tbl FROM tbl_a
    UNION ALL
    SELECT tbl_b.ID, tbl_b.col, 'name_to_display2' AS tbl FROM tbl_b
) t
WHERE ID IN (SELECT ID FROM tbl_a) AND ID IN (SELECT ID FROM tbl_b)
GROUP BY
    ID, col
HAVING COUNT(*) = 1
ORDER BY ID

You can use a tool that I developed myself:
https://github.com/hardeepvicky/MySql-Schema-Compare

I tried the above answer but found that if one table has null values and the second table has values in a column then the intersect code above does not report this fact.
-- Joins the Plex and Azure copies of account_period_balance on
-- (pcn, period, account_no) and shows ytd_debit from both sides.
-- The trailing numbers look like row counts the author recorded while
-- enabling one condition at a time -- TODO confirm.
-- NOTE(review): `=` and `!=` both evaluate to NULL when either operand is
-- NULL, which would explain why the two commented-out ytd_debit probes
-- (148,998 and 0 rows) do not add up to 157,283; a NULL-safe comparison
-- (`<=>`) would be needed to surface those rows -- confirm.
select p.pcn,p.period,p.account_no,p.ytd_debit,a.ytd_debit
-- select count(*) -- 157,283
from Plex.account_period_balance p -- 157,283/202207,148,998
join Azure.account_period_balance a -- 157,283/202207,148,998
on p.pcn = a.pcn
and p.period = a.period
and p.account_no = a.account_no -- 157,283
where p.period_display = a.period_display -- 157,283
and p.debit = a.debit -- 157,283
-- and p.ytd_debit = a.ytd_debit -- 148,998
-- and p.ytd_debit != a.ytd_debit -- 0

Related

PostgreSQL stored function with multiple queries

I'm working on a PostgreSQL function, but having a real struggle with it. It's not my main area so that's probably why, but I wanted to see if this is doable.
I'm trying to create a function to generate records based on year.
In php I'd do something along the lines of:
/**
 * Runs per-year processing either for one given year or, when no year is
 * given, for every distinct year found in myTable.
 *
 * @param mixed $year a single year, or empty to process all years
 */
function recordsByYear($year=''){
    if(!empty($year)){
        // A specific year was requested: process only that one.
        $years = array($year);
    }
    else{
        // No year given: collect every year present in the table.
        $years = array();
        $sql = "SELECT year FROM myTable GROUP BY year ORDER BY year;";
        $res = $conn->query($sql);
        if($res !== false){
            foreach($res->fetchAll(PDO::FETCH_COLUMN) as $oneYear){
                $years[] = $oneYear;
            }
        }
    }
    foreach($years as $thisYear){
        //do some queries based on $thisYear
    }
}
But I'd like to create a function inside of PostgreSQL, and struggling because I'm not familiar enough with how it all works. I'd like to return a table. I can do some basic stuff, but haven't been able to get something like this working where I have one query then loop through those results running an additional query for each year, then combine the results of that second query and spit out the results as a table.
Is hard to guess without more information but you probably need a JOIN
SELECT *
FROM (
SELECT DISTINCT year
FROM myTable
) y
JOIN ( SELECT year, ... <some query>
....
) t
ON y.year = t.year
PostgreSQL function:
-- Returns every distinct year stored in myTable, in ascending order,
-- one value per result row.
CREATE FUNCTION record_years()
RETURNS SETOF int
LANGUAGE SQL
AS $$
    SELECT DISTINCT year
    FROM myTable
    ORDER BY year
$$;
Your PHP code:
$sql = "SELECT record_years()";
or
$sql = "SELECT year FROM record_years() AS year";
or alternatively all values in a single row, as a JSON array:
// Fetch all years in a single row, encoded as a JSON array, and decode them.
$sql = "SELECT array_to_json(ARRAY(SELECT record_years())) AS arr";
$res = $conn->query($sql);
if($res !== false){
    $row = $res->fetch();
    // json_decode() already returns the complete list of years. Assign it
    // rather than appending with []: appending would store the whole list as
    // one nested element instead of producing a flat array of years.
    $years = json_decode($row['arr']);
}

How to retrieve data from 2 column in one table?

I have a table named accessories_other in my database. In the table, I have column :
1) Item
2)Available
This is the illustration on how the data in the respective column.
Item
Mouse
Keyboard
Cable
Available
4
6
3
The thing is, I would like to select Item = 'Mouse' together with column 'Available'=4. If the available mouse is less than 5, it will send me an email for the next step. But I stuck until this stage.
This is SQL statement that I create, and it count each row for 'Available' column, and send the email if the row of Available column is less than 5, which is not I want.
$sql ="SELECT Item, Available FROM accessories_other WHERE Item ='Mouse'
AND Available <5";
How do I do so that it can retrieve mouse which is the availability less than 5.
This is just to show how it could be done . You should be using MySQLi
or PDO . Also if in Production environment , you should not be
displaying MySQL errors to the user .
You could do it either way :
// Variant 1: always read the Mouse row, then decide in PHP whether stock is low.
$sql = "SELECT Item, Available FROM accessories_other
WHERE Item = 'Mouse'";
$result = mysql_query( $sql ) or die( 'Query failed.'.mysql_error() );
$row = mysql_fetch_array( $result );
$hasRow = mysql_num_rows( $result ) > 0;
if( $hasRow )
{
    echo $row['Item'] . ': ' . $row['Available'];
    $stockIsLow = $row['Available'] < 5;
    if( $stockIsLow )
    {
        // Code to send email
    }
    else
    {
        // Code to what ever you would like to do here
    }
}
or
// Variant 2: let SQL do the filtering -- a row is returned only when the
// Mouse stock is below 5.
$sql = "SELECT Item, Available FROM accessories_other
WHERE Item = 'Mouse' AND Available < 5";
$result = mysql_query( $sql ) or die( 'Query failed.'.mysql_error() );
if( mysql_num_rows( $result ) > 0 )
{
    // A row came back, so stock is low: fetch it before using it, then notify.
    $row = mysql_fetch_array( $result );
    echo $row['Item'] ,': ' ,$row['Available'];
    // Code to send email
}
else
{
    // No row: stock is 5 or more (or there is no Mouse row at all).
    // Code to what ever you would like to do here
}
I think your query will not show results of mouse which is lesser than 5..
I suggest you try:
// Select the Mouse row regardless of its stock level; the closing double quote ends the PHP string.
$sql ="SELECT Item, Available FROM accessories_other WHERE Item ='Mouse'";
and then, try to implement your code in another language..
if I'm not mistaken, you're using php..
your query should return exactly one row if you have data in table "accessories" as shown here. unless you have duplicates like multiple rows rows with Item = 'Mouse' and same or different value which is also less than 5 then only query will return multiple results.
also just notice that in the explanation you use the table "accessories" but in sample query you have used table "accessories_other". make sure you are working against the right table.

Using more than one queries on perl DBI

I'm sure there is a better way to do this on the same line, but I'm unable to figure out how, since I'm a beginner in Perl. Basically what I need to do is select, delete and count the results.
$sth = $dbh->prepare("SELECT env,server, mwp.is_reference where env='$ARGV[1]';");
$sth->execute();
$sth2 = $dbh->prepare("delete from mwp.is_info_package where env='$ARGV[1]'");
$sth2->execute();
$sth3 = $dbh->prepare("SELECT count(1) from mwp.is_reference where env='$ARGV[1]'");
$sth3->execute()
The objective is how do i use the 3 queries at the same line, instead having 3 executes.
Well you could start out using placeholders ( '?' ).
# The three statements to run; each takes the environment name as its single
# placeholder. (Arrays use the @ sigil; DELETE needs FROM before the table.)
my @qlist
    = ( 'SELECT env,server FROM mwp.is_reference where env=?'
      , 'DELETE FROM mwp.is_info_package WHERE env=?'
      , 'SELECT count(1) FROM mwp.is_reference where env=?'
      );
And then you can iterate through them like this:
# Run every statement in turn with the environment name bound to its placeholder.
my $env = $ARGV[1];
foreach my $query ( @qlist ) {
    $dbh->prepare( $query )->execute( $env );
    # Stop at the first statement the database rejects.
    Carp::croak( $dbh->errstr ) if $dbh->err;
}
But of course, you really want to select the two outputs, don't you?
use Carp qw<croak>;
# Expects $dbh (connected handle) and $env (environment name, e.g. $ARGV[1])
# to be defined already.
my $select_query = 'SELECT env,server FROM mwp.is_reference where env=?';
my $delete_query = 'DELETE FROM mwp.is_info_package WHERE env=?';
my $count_query = 'SELECT count(1) FROM mwp.is_reference where env=?';
my %empty_atts;

# All (env, server) rows for this environment, as a reference to an array of rows.
my $rows = $dbh->selectall_arrayref( $select_query, \%empty_atts, $env );
croak( $dbh->errstr ) if $dbh->err;

# The delete returns no rows, so do() (prepare + execute in one call) is enough.
$dbh->do( $delete_query, \%empty_atts, $env );
croak( $dbh->errstr ) if $dbh->err;

# Single-row, single-column result: the number of matching rows.
my ( $count ) = $dbh->selectrow_array( $count_query, \%empty_atts, $env );
croak( $dbh->errstr ) if $dbh->err;
I solved the problem using the following query statement:
$sth = $dbh->prepare("select env,iserver, ( select count(1) from is_reference where env='$ARGV[1]' ) as total from is_reference where env='$ARGV[1]'");
not the most elegant way, but solved my problem with the less lines. Regarding the delete query, i moved to another condition to check if the table have data or not.
THanks all.
You could use a stored procedure that performs those functions and returns the results of the select as well as a count, then you only need to do:
my $sth = $dbh->prepare("EXEC procedure_name ?");
$sth->execute( $ARGV[1] );
As an aside, the way you're using prepare and execute is undesirable. You use prepare to avoid having to have Perl variables directly in the query; your Perl variables should be passed to execute() as values, not part of the string given to prepare(). There are a number of good reasons to do this, including protection against SQL Injection attacks.
I also noticed oddness in your last SQL query. I think you probably want
SELECT count(env) FROM mwp.is_reference where env=?
Otherwise it will always return "1" as the count... Likewise, unless there are database triggers doing something interesting, you could combine the first and last query into one this way (I'll leave count(1) for this in case that's really what you want):
# Fetch the rows once and derive the count from the fetched result set.
my $sth = $dbh->prepare('SELECT env,server FROM mwp.is_reference where env=?');
# execute() returns a status value, not the statement handle, so its result
# must not be assigned back to $sth.
$sth->execute( $ARGV[1] );
my $result_set = $sth->fetchall_arrayref();
# Dereference the array ref with @{ } to count the rows.
my $count = scalar @{ $result_set };
The $result_set will be a reference to an ARRAY of ARRAYRefs containing the results; $count will contain the number of rows in that result set.

How to execute query with subqueries on a table and get a Rowset object as a result in Zend?

I'm currently struggling on how to execute my query on a Table object in Zend and get a Rowset in return. Reason I need particularly THIS is because I'm modifying a code for existing project and I don't have much flexibility.
Query:
SELECT *
FROM `tblname` ud
WHERE ud.user_id = some_id
AND
(
(ud.reputation_level > 1)
OR
(
(SELECT COUNT( * )
FROM `tblname` t
WHERE t.user_id = ud.user_id
AND t.category_id <=> ud.category_id
AND t.city_id <=> ud.city_id
) = 1
)
)
Is there a way to describe this query using Select object?
Previous SQL solution was very simple and consisted of one WHERE clause:
$where = $this->getAdapter()->quoteInto("user_id = ?",$user_id);
return $this->fetchAll($where);
I need to produce same type of the result (so that it could be processed by existing code) but for more complicated query.
Things I've tried
$db = Zend_Db_Table::getDefaultAdapter();
return $db->query($sql)->fetchAll();
---------------- OR ----------------------
return $this->fetchAll($select);
---------------- OR ----------------------
return $this->_db->query($sql)->fetchAll();
But they either produce arrays instead of objects or fail with Cardinality violation message.
I would appreciate any help on how to handle SQL text queries in Zend.
// Get the default database adapter from Zend_Db_Table.
$dbAdapter = Zend_Db_Table::getDefaultAdapter();
// Change the fetch mode because arrays are not wanted as the result type
$dbAdapter->setFetchMode(Zend_Db::FETCH_OBJ);
$sql = "you're long sql here";
$result = $dbAdapter->fetchAll($sql);
// Dump the fetched result and stop -- for inspection only.
Zend_Debug::dump($result);
exit;
For a list of all fetch modes, see Zend_Db_Adapter.
To write your query using Zend_Db_Select instead of a manual string, look at Zend_Db_Select.

What is the best way to implement a substring search in SQL?

We have a simple SQL problem here. In a varchar column, we wanted to search for a string anywhere in the field. What is the best way to implement this for performance? Obviously an index is not going to help here, any other tricks?
We are using MySQL and have about 3 million records. We need to execute many of these queries per second so really trying to implement these with the best performance.
The most simple way to do this is so far is:
Select * from table where column like '%search%'
I should further specify that the column is actually a long string like "sadfasdfwerwe" and I have to search for "asdf" in this column. So they are not sentences and trying to match a word in them. Would full text search still help here?
Check out my presentation Practical Fulltext Search in MySQL.
I compared:
LIKE predicates
Regular expression predicates (no better than LIKE)
MyISAM FULLTEXT indexing
Sphinx Search
Apache Lucene
Inverted indexing
Google Custom Search Engine
Today what I would use is Apache Solr, which puts Lucene into a service with a bunch of extra features and tools.
Re your comment: Aha, okay, no. None of the fulltext search capabilities I mentioned are going to help, since they all assume some kind of word boundaries
The other way to efficiently find arbitrary substrings is the N-gram approach. Basically, create an index of all possible sequences of N letters and point to the strings where each respective sequence occurs. Typically this is done with N=3, or a trigram, because it's a point of compromise between matching longer substrings and keeping the index to a manageable size.
I don't know of any SQL database that supports N-gram indexing transparently, but you could set it up yourself using an inverted index:
-- Dictionary of three-character sequences.
CREATE TABLE trigrams (
    trigram CHAR(3) PRIMARY KEY
);
-- Inverted index: one row per (trigram, document) pair in which the
-- trigram occurs.
CREATE TABLE trigram_matches (
    trigram     CHAR(3),
    document_id INT,
    PRIMARY KEY (trigram, document_id),
    FOREIGN KEY (trigram) REFERENCES trigrams (trigram),
    FOREIGN KEY (document_id) REFERENCES mytable (document_id)
);
Now populate it the hard way:
-- Populate the inverted index: one row for every trigram/document pair where
-- the document text contains the trigram. The target columns are named
-- explicitly so the statement keeps working if the table's column order changes.
INSERT INTO trigram_matches (trigram, document_id)
SELECT t.trigram, d.document_id
FROM trigrams t
INNER JOIN mytable d
    ON d.textcolumn LIKE CONCAT('%', t.trigram, '%');
Of course this will take quite a while! But once it's done, you can search much more quickly:
-- Documents that contain the trigram 'abc', found through the inverted index.
SELECT d.*
FROM mytable d
INNER JOIN trigram_matches t
    ON d.document_id = t.document_id
WHERE t.trigram = 'abc'
Of course you could be searching for patterns longer than three characters, but the inverted index still helps to narrow your search a lot:
-- Longer pattern: the trigram lookup narrows the candidate documents first,
-- then LIKE verifies the full substring on those candidates.
SELECT d.*
FROM mytable d
INNER JOIN trigram_matches t
    ON d.document_id = t.document_id
WHERE t.trigram = 'abc'
  AND d.textcolumn LIKE '%abcdef%';
If you want to match whole words, look at a FULLTEXT index and MATCH() AGAINST(). And of course, take load off your database server: cache results for an appropriate amount of time for your specific needs.
First, maybe this is an issue with a badly designed table that stores a delimited string in one field instead of correctly designing to make a related table. If this is the case, you should fix your design.
If you have a field with long descriptive text (say, a notes field) and the search is always by whole word, you can do a full-text search.
Consider if you can require your users to at least give you the first character of what they are searching for if it is an ordinary field like Last_name.
Consider doing an exact match search first and only performing the wildcard match if no results are returned. This will work if you have users who can provide exact matches. We did this once with airport name searches; it came back really fast if they put in the exact name and slower if they did not.
If you want to search just for strings that are not words that may be somewhere in the text, you are pretty much stuck with bad performance.
mysql fulltext search's quality (for this purpose) is poor, if your language is not English
trigram search gives very good results, for this task
postgreSQL has trigram index, it's easy to use :)
but if you need to do it in mysql, try this, improved version of Bill Karwin's answer:
-each trigram is stored only once
-a simple php class uses the data
<?php
/*
# mysql table structure
CREATE TABLE `trigram2content` (
`trigram_id` int NOT NULL REFERENCES trigrams(id),
`content_type_id` int(11) NOT NULL,
`record_id` int(11) NOT NULL,
PRIMARY KEY (`content_type_id`,`trigram_id`,`record_id`)
);
#each trigram is stored only once
CREATE TABLE `trigrams` (
`id` int not null auto_increment,
`token` varchar(3) NOT NULL,
PRIMARY KEY (id),
UNIQUE token(token)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
SELECT count(*), record_id FROM trigrams t
inner join trigram2content c ON t.id=c.trigram_id
WHERE (
t.token IN ('loc','ock','ck ','blo',' bl', ' bu', 'bur', 'urn')
AND c.content_type_id = 0
)
GROUP by record_id
ORDER BY count(*) DESC
limit 20;
*/
/**
 * Trigram index for fuzzy text search, stored in two MySQL tables:
 * `trigrams` (id, token) and `trigram2content` (trigram_id, content_type_id,
 * record_id).
 *
 * Uses the legacy mysql_* extension, so it only runs on PHP versions that
 * still ship ext/mysql (it was removed in PHP 7.0).
 */
class trigram
{
    /** Link returned by mysql_connect(), or false when connecting failed. */
    private $dbLink;

    /** Known content types as array(content_type_id, type name) pairs. */
    var $types = array(
        array(0, 'name'),
        array(1, 'city'));

    /** Constructor: connects to the database and selects the UTF-8 charset. */
    function trigram()
    {
        //connect to db
        $this->dbLink = mysql_connect("localhost", "username", "password");
        if (!$this->dbLink) {
            // Report the failure; the error text used to be fetched and discarded.
            trigger_error("trigram: database connection failed: " . mysql_error(), E_USER_WARNING);
            return;
        }
        mysql_select_db("dbname", $this->dbLink);
        mysql_query("SET NAMES utf8;", $this->dbLink);
    }

    /**
     * Maps a type name ('name', 'city') to its numeric content_type_id.
     * Returns "" for an unknown name.
     */
    function get_type_value($type_name){
        foreach ($this->types as $type) {
            if ($type[1] == $type_name)
                return $type[0];
        }
        return "";
    }

    /**
     * Splits $word into overlapping n-grams of $n characters (UTF-8 aware).
     * Each n-gram is returned already escaped with mysql_real_escape_string(),
     * ready to be embedded in a quoted SQL literal. Words shorter than $n
     * yield an empty array.
     */
    function getNgrams($word, $n = 3) {
        $ngrams = array();
        $len = mb_strlen($word, 'utf-8');
        for($i = 0; $i < $len-($n-1); $i++) {
            $ngrams[] = mysql_real_escape_string(mb_substr($word, $i, $n, 'utf-8'), $this->dbLink);
        }
        return $ngrams;
    }

    /**
     * Translates (escaped) trigrams into their ids, inserting unknown trigrams
     * into `trigrams` on the way. The output has one id per input item, in
     * input order, so repeated trigrams give repeated ids.
     *
     * input: array('hel', 'ell', 'llo', 'lo ', 'o B', ' Be', 'Bel', 'ell', 'llo', 'lo ', 'o ')
     * output: array(1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 8)
     */
    private function getTrigramIds(&$t){
        $u = array_unique($t);
        if (count($u) == 0) {
            return array();
        }
        $ids = array(); // escaped token => id
        $q = "SELECT * FROM trigrams WHERE token IN ('" . implode("', '", $u) . "')";
        $query = mysql_query($q, $this->dbLink);
        if ($query) {
            while ($row = mysql_fetch_array($query, MYSQL_ASSOC)) {
                // The input tokens are escaped but the database returns them raw;
                // key by the escaped form so both sides compare equal even for
                // tokens that contain quotes or backslashes.
                $ids[ mysql_real_escape_string($row['token'], $this->dbLink) ] = $row['id'];
            }
        }
        // Insert the trigrams that are not in the table yet.
        foreach ($u as $token) {
            if (isset($ids[$token])) {
                continue;
            }
            mysql_query("INSERT INTO trigrams (token) VALUES('$token')", $this->dbLink);
            // Pass the link so the id is read from this object's own connection.
            $ids[$token] = mysql_insert_id($this->dbLink);
        }
        //as many ids as items (a trigram that occurs several times in the input occurs several times in the output)
        $result = array();
        foreach ($t as $token) {
            $result[] = $ids[$token];
        }
        return $result;
    }

    /**
     * Indexes $data for record $id under the given content type name.
     */
    function insertData($id, $data, $type){
        $t = $this->getNgrams($data);
        $id = intval($id);
        $type = $this->get_type_value($type);
        // One row per distinct trigram: the primary key
        // (content_type_id, trigram_id, record_id) rejects repeated tuples, so
        // text with a repeated trigram ("hello bello") would otherwise make
        // the multi-row INSERT fail with a duplicate-key error.
        $tIds = array_unique($this->getTrigramIds($t));
        if (count($tIds) == 0) {
            // Text shorter than three characters has no trigrams; an INSERT
            // with an empty VALUES list would be a syntax error.
            return;
        }
        $rows = array();
        foreach ($tIds as $tid) {
            $rows[] = "($tid, $type, $id)";
        }
        $q = "INSERT INTO trigram2content (trigram_id, content_type_id, record_id) VALUES "
           . implode(", ", $rows);
        mysql_query($q, $this->dbLink);
    }

    /**
     * Re-indexes a record: removes its old trigram rows, then inserts new ones.
     */
    function updateData($id, $data, $type){
        mysql_query("DELETE FROM trigram2content WHERE record_id=".intval($id)." AND content_type_id=".$this->get_type_value($type), $this->dbLink);
        $this->insertData($id, $data, $type);
    }

    /**
     * Returns up to 20 records of the given type whose share of matched
     * trigrams (score) is at least 0.6, best match first. Each result row
     * holds count(*), score and record_id.
     */
    function search($str, $type){
        $tri = $this->getNgrams($str);
        $max = count($tri);
        if ($max == 0) {
            // Fewer than three characters: no trigrams to match, and the
            // query would divide by zero.
            return array();
        }
        $q = "SELECT count(*), count(*)/$max as score, record_id FROM trigrams t inner join trigram2content c ON t.id=c.trigram_id
WHERE (
t.token IN ('" . implode("', '", $tri) . "')
AND c.content_type_id = ".$this->get_type_value($type)."
)
GROUP by record_id
HAVING score >= 0.6
ORDER BY count(*) DESC
limit 20;";
        $query = mysql_query($q, $this->dbLink);
        if (!$query) {
            // Query failed (e.g. unknown type name): report no matches.
            return array();
        }
        $result = array();
        while ($row = mysql_fetch_array($query, MYSQL_ASSOC)) {
            $result[] = $row;
        }
        return $result;
    }
}
and usage:
// Example: index two sample strings under the "name" content type ...
$t = new trigram();
$t->insertData(1, "hello bello", "name");
$t->insertData(2, "hellllo Mammmma mia", "name");
// ... then print whatever search() returns for the query "helo".
print_r($t->search("helo", "name"));