views:

22

answers:

1

This is the table.

CREATE TABLE `posts` (
  `post_id` int(20) NOT NULL AUTO_INCREMENT,
  `post_archived` enum('Y','N') COLLATE utf8_unicode_ci NOT NULL DEFAULT 'N',
  `post_updatedts` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  `post_author` int(20) NOT NULL DEFAULT '0',
  `post_date` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  `post_pub_date` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  `post_status` varchar(32) COLLATE utf8_unicode_ci NOT NULL DEFAULT 'processing',
  `post_type` varchar(32) COLLATE utf8_unicode_ci DEFAULT NULL,
  `post_category` int(20) NOT NULL DEFAULT '1',
  `post_tags` text COLLATE utf8_unicode_ci,
  `post_title` text COLLATE utf8_unicode_ci,
  `post_orig_url` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `post_domain` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `post_url` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `post_content` text COLLATE utf8_unicode_ci,
  `post_votes_up` smallint(11) NOT NULL DEFAULT '0',
  `post_votes_down` smallint(11) NOT NULL DEFAULT '0',
  `post_comments` enum('open','closed') COLLATE utf8_unicode_ci NOT NULL DEFAULT 'open',
  `post_subscribe` tinyint(1) NOT NULL DEFAULT '0',
  `post_updateby` int(20) NOT NULL DEFAULT '0',
  `post_siteid` int(20) NOT NULL DEFAULT '1',
  PRIMARY KEY (`post_id`),
  KEY `post_siteid` (`post_siteid`),
  KEY `post_archived` (`post_archived`),
  KEY `post_status` (`post_status`),
  KEY `post_type` (`post_type`),
  FULLTEXT KEY `post_title` (`post_title`,`post_domain`,`post_url`,`post_content`,`post_tags`)
) ENGINE=MyISAM AUTO_INCREMENT=4 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci COMMENT='Story Posts'

This is some sample data I am working on:

INSERT INTO `posts` VALUES (1, 'N', '2010-7-26 19:47:10', 1, '2010-7-26 15:08:56', '0000-0-0 00:00:00', 'new', 'news', 3, '%CE%BA%CE%B1%CE%BB%CF%8D%CF%84%CE%B5%CF%81%CE%BF%2C+%CE%B1%CF%8D%CF%84%CE%BF', '%26Lambda%3B%CE%AD%26tau%3B%26epsilon%3B+%26nu%3B%26alpha%3B+%26kappa%3B%CE%AC%26nu%3B%26omicron%3B%26upsilon%3B%26mu%3B%26epsilon%3B+%26kappa%3B%CE%AC%26tau%3B%26iota%3B+%26gamma%3B%26iota%3B%26alpha%3B+%26kappa%3B%CE%AC%26tau%3B%26iota%3B+%CE%AC%26lambda%3B%26lambda%3B%26omicron%3B+%26alpha%3B%26nu%3B%26tau%3B%CE%AF+%26alpha%3B%26upsilon%3B%26tau%3B%CF%8C%3B', 'http%3A%2F%2Flocalhost%2Fdev%2Fhotarucms%2Findex.php%3Fpage%3D1', 'http%3A%2F%2Flocalhost', '%CE%BB%CE%AD%CF%84%CE%B5-%CE%BD%CE%B1-%CE%BA%CE%AC%CE%BD%CE%BF%CF%85%CE%BC%CE%B5-%CE%BA%CE%AC%CF%84%CE%B9-%CE%B3%CE%B9%CE%B1-%CE%BA%CE%AC%CF%84%CE%B9-%CE%AC%CE%BB%CE%BB%CE%BF-%CE%B1%CE%BD%CF%84%CE%AF', '%CE%A4%CE%B9+%CE%B8%CE%B1+%CE%BB%CE%AD%CE%B3%CE%B1%CF%84%CE%B5+%CE%BD%CE%B1+%CE%BA%CE%AC%CE%BD%CE%BF%CF%85%CE%BC%CE%B5+%CE%BA%CE%AC%CF%84%CE%B9+%CE%AC%CE%BB%CE%BB%CE%BF+%CE%B1%CE%BD%CF%84%CE%AF+%CE%B1%CF%85%CF%84%CF%8C+%CF%80%CE%BF%CF%85+%CE%B8%CE%AD%CE%BB%CE%B5%CF%84%CE%B5+%CE%BD%CE%B1+%CE%BA%CE%AC%CE%BD%CE%BF%CF%85%CE%BC%CE%B5%3B%0D%0A%CE%91%CE%BD+%CE%BA%CE%AC%CE%BD%CE%BF%CF%85%CE%BC%CE%B5+%CE%B1%CF%85%CF%84%CF%8C%2C+%CE%B8%CE%B1+%CF%80%CF%81%CE%AD%CF%80%CE%B5%CE%B9+%CE%BD%CE%B1+%CE%BC%CE%B9%CE%BB%CE%AE%CF%83%CE%BF%CF%85%CE%BC%CE%B5+%CE%B3%CE%B9%CE%B1+%CE%BA%CE%AC%CF%84%CE%B9+%CE%AC%CE%BB%CE%BB%CE%BF+%CE%B1%CE%BD%CF%84%CE%AF+%CE%B1%CF%85%CF%84%CF%8C.%0D%0A%0D%0A%CE%A9%CF%83%CF%84%CF%8C%CF%83%CE%BF%2C+%CE%B8%CE%B1+%CE%B5%CE%AF%CE%BD%CE%B1%CE%B9+%CE%BA%CE%B1%CE%BB%CF%8D%CF%84%CE%B5%CF%81%CE%B1+%CE%BD%CE%B1+%CE%BA%CE%AC%CE%BD%CE%BF%CF%85%CE%BC%CE%B5+%CF%84%CE%BF+%CE%AC%CE%BB%CE%BB%CE%BF+%CE%B1%CE%BD%CF%84%CE%AF+%CE%BD%CE%B1+%CE%BC%CE%B9%CE%BB%CE%AC%CE%BC%CE%B5+%CE%B3%CE%B9%CE%B1+%CE%B1%CF%85%CF%84%CF%8C+%CE%BA%CE%B1%CE%B9+%CE%AD%CF%84%CF%83%CE%B9+%CE%B8%CE%B1+%CE%B5%CE%AF%CE%BD%CE%B1%CE%B9+%CF%8C%CE%BB%CE%B1+%CF%80%CE%BF%CE%BB%CF%8D+%CE%BA%CE%B1%CE%BB%CF%8D%CF%84%CE%B5%CF%81%CE%B1+%CE%B1%CF%86%CE%BF%CF%8D+%CE%B1%CF%85%CF%84%CF%8C+%CE%B5%CE%AF%CE%BD%CE%B1%CE%B9+%CE%BA%CE%B1%CE%BB%CF%8D%CF%84%CE%B5%CF%81%CE%BF+%CE%B1%CF%80%CE%BF+%CF%84%CE%BF+%CE%AC%CE%BB%CE%BB%CE%BF.', 1, 0, 'open', 0, 1, 1);
INSERT INTO `posts` VALUES (2, 'N', '2010-7-27 13:16:41', 1, '2010-7-27 13:00:02', '0000-0-0 00:00:00', 'new', 'news', 5, 'some%2C+nice', 'nice+going', 'http%3A%2F%2Flocalhost%2Fdev%2Fhotarucms%2Fvarious%2Fnice-going%2F', 'http%3A%2F%2Flocalhost', 'nice-going', 'This+is+a+good+cms.+I+hope+it+makes+it+big.+It+should+be+a+good+thing+to+make+it+nicer+and+more+friendly+to+unicode.', 0, 0, 'open', 0, 1, 1);
INSERT INTO `posts` VALUES (3, 'N', '2010-7-28 08:07:45', 2, '2010-7-28 08:07:36', '0000-0-0 00:00:00', 'new', 'news', 4, '%CE%B5%CF%81%CF%89%CF%84%CE%AE%CF%83%CE%B5%CE%B9%CF%82%2C+%CE%B1%CF%80%CE%B1%CE%BD%CF%84%CE%AE%CF%83%CE%B5%CE%B9%CF%82', '%26Mu%3B%26pi%3B%26omicron%3B%26rho%3B%26epsilon%3B%CE%AF+%26nu%3B%26alpha%3B+%26mu%3B%26omicron%3B%26upsilon%3B+%26alpha%3B%26rho%3B%CE%AD%26sigma%3B%26epsilon%3B%26iota%3B', 'http%3A%2F%2Flocalhost%2Fdev%2Fhotarucms%2F%CF%83%CF%84%CE%B1%CF%84%CE%B9%CE%BA%CE%AC%2F%CE%BC%CF%80%CE%BF%CF%81%CE%B5%CE%AF-%CE%BD%CE%B1-%CE%BC%CE%BF%CF%85-%CE%B1%CF%81%CE%AD%CF%83%CE%B5%CE%B9%2F', 'http%3A%2F%2Flocalhost', '%CE%BC%CF%80%CE%BF%CF%81%CE%B5%CE%AF-%CE%BD%CE%B1-%CE%BC%CE%BF%CF%85-%CE%B1%CF%81%CE%AD%CF%83%CE%B5%CE%B9', '%CE%98%CE%B1+%CE%B4%CE%BF%CF%8D%CE%BC%CE%B5+%CF%84%CE%B9+%CE%BC%CF%80%CE%BF%CF%81%CE%B5%CE%AF+%CE%BD%CE%B1+%CE%B3%CE%AF%CE%BD%CE%B5%CE%B9+%CE%B1%CF%86%CE%BF%CF%8D+%CE%B4%CE%B5%CE%BD+%CE%BE%CE%AD%CF%81%CF%89+%CE%B1%CE%BA%CF%8C%CE%BC%CE%B7+%CE%B1%CE%BD+%CE%B5%CE%AF%CE%BD%CE%B1%CE%B9+%CE%B4%CF%85%CE%BD%CE%B1%CF%84%CF%8C+%CE%BD%CE%B1+%CF%84%CE%BF+%CE%BA%CE%BF%CE%B9%CF%84%CE%AC%CE%BE%CF%89.+%CE%98%CE%B1+%CF%80%CF%81%CE%AD%CF%80%CE%B5%CE%B9+%CE%BD%CE%B1+%CF%80%CF%81%CE%BF%CF%87%CF%89%CF%81%CE%AE%CF%83%CE%BF%CF%85%CE%BC%CE%B5+%CE%BC%CE%B5+%CF%83%CF%8D%CE%BD%CE%B5%CF%83%CE%B7+%CE%BA%CE%B1%CE%B9+%CE%B1%CF%80%CE%BF%CF%84%CE%B5%CE%BB%CE%B5%CF%83%CE%BC%CE%B1%CF%84%CE%B9%CE%BA%CF%8C%CF%84%CE%B7%CF%84%CE%B1.+%CE%9D%CE%B1+%CE%BF%CF%80%CE%BB%CE%B9%CF%83%CF%84%CE%BF%CF%8D%CE%BC%CE%B5+%CE%BC%CE%B5+%CF%85%CF%80%CE%BF%CE%BC%CE%BF%CE%BD%CE%AE+%CE%BA%CE%B1%CE%B9+%CE%BA%CE%BF%CF%85%CF%81%CE%AC%CE%B3%CE%B9%CE%BF+%CE%B3%CE%B9%CE%B1+%CE%BD%CE%B1+%CF%80%CF%81%CE%BF%CF%87%CF%89%CF%81%CE%AE%CF%83%CE%BF%CF%85%CE%BC%CE%B5+%CE%B1%CF%81%CE%B3%CF%8C%CF%84%CE%B5%CF%81%CE%B1+%CF%83%CE%B5+%CE%AD%CE%BD%CE%B1+%CE%BC%CE%AD%CE%BB%CE%BB%CE%BF%CE%BD+%CF%80%CE%B9%CE%BF+%CE%BA%CE%B1%CE%BB%CF%8C.', 1, 0, 'open', 0, 2, 1);

This query works as expected, giving one result.

SELECT * FROM posts WHERE MATCH( post_title, post_domain, post_url, post_content, post_tags ) AGAINST ('%26upsilon%3B+%26alpha' IN BOOLEAN MODE)

While this one fails to return anything than an empty result set.

SELECT * FROM posts WHERE MATCH( post_title, post_domain, post_url, post_content, post_tags ) AGAINST ('%CE%B5%CF%81%CF%89%CF%84%CE%AE%CF%83%CE%B5%CE%B9%CF%82%2C' IN BOOLEAN MODE)

I am baffled because it seems that including a proper string of characters in the string constant for AGAINST makes is work while using solely a Urlencoded string on its own does nothing.

Anyone come across this ?

+2  A: 

http://dev.mysql.com/doc/refman/5.0/en/fulltext-natural-language.html

The MySQL FULLTEXT implementation regards any sequence of true word characters (letters, digits, and underscores) as a word. That sequence may also contain apostrophes (“'”), but not more than one in a row.

So: the first probably looks for '26upsilon' & '26alpha' (no 2B as it's below the minimum word length), %s are not considered part of a word. Your second search treats '%'s as a separator, which results in all 'words' mysql recognizes being under the minimum word length. Urldecode your contents if possible, or use LIKE for specifix searches.

An illustration:

CREATE TABLE f (t varchar(255), FULLTEXT INDEX(t));
INSERT INTO f values('%foobar%fozbar%foxbar%');
INSERT INTO f values('%fo%fz%fb%');

SELECT * FROM f WHERE match(t) AGAINST ('foobar%foxbar' IN BOOLEAN MODE);
1 Result, so % is a word delimiter
SELECT * FROM f WHERE match(t) AGAINST ('fo%fz' IN BOOLEAN MODE);
O results (minimum under 4 chars
SELECT * FROM f WHERE match(t) AGAINST ('"fo%fz"' IN BOOLEAN MODE);
O results (minimum under 4 chars)
SELECT * FROM f WHERE match(t) AGAINST ('+fo +fz' IN BOOLEAN MODE);
O results (still that pesky minimum)
Wrikken
Is there any way to change this behavior for %'s ?
petsagouris
There are possiblities, look at http://dev.mysql.com/doc/refman/5.0/en/fulltext-fine-tuning.html, especially the stuff about characters that are considered word characters and/or ft_min_word_len. Whatever you do, it would be a kludge though: as with so many 'database-problems', you should save the actual unencoded data, either as seperate column, or in the original column, and do the encoding logic in your application rather then storing it encoded in the database.
Wrikken
I though so, but just needed to see if there was another way.Nevertheless thanks for the answer.
petsagouris