renehaentjens |
09-08-2004 03:50 AM |
API to (re-)index a virtual directory
The code below bypasses the spider: it is an API that you can call from a PHP script to tell PhpDig, "for this URL, please put this list of words in your tables."
It is submitted here as-is, with no guarantee for completeness or correctness. Your comments and suggestions are welcome.
Most of the code is stolen from the "real" PhpDig code, and simplified to fit my needs.
Charter, please advise me on what I should put in my PHP script that includes this code to conform to the GNU GPL.
Code:
<?php
/* The functions below inject urls and words directly into PhpDig's tables:
keywords: key_id, twoletters, keyword (lowercase, accents removed)
sites: site_id, site_url (e.g. http://xx.yy.zz/), upddate, ...
spider: spider_id, site_id, upddate, num_words, first_words,
path (e.g. uu/vv/ww/), file (e.g. index.php?sid=xxx), ...
engine: spider_id, key_id, weight
The site must already exist, table 'sites' is not touched.
Only iso-8859-1 is supported!
The code underneath the functions shows how to re-index virtual directory
$path from an array $idt containing elements $url => $text
*/
// PhpDig's includes directory (keep the trailing slash); adjust to your installation.
$phpDigInc = 'your_disk_and_directory.../183phpdig/includes/';
// Script that connects to PhpDig's database.
$phpDigIncCn = "{$phpDigInc}connect.php";
// Stopword list, one word per line.
$phpDigIncCw = "{$phpDigInc}common_words.txt";
/**
 * Remove every indexed page of one virtual directory from PhpDig's index.
 *
 * Looks up the site by its URL in table 'sites', deletes the 'engine' rows
 * (keyword references) of every 'spider' row with the given path, then
 * deletes those 'spider' rows. A one-line HTML report is echoed.
 *
 * @param string $url   site URL exactly as stored in sites.site_url
 * @param string $path  virtual directory exactly as stored in spider.path
 * @return int|false    the site_id on success, FALSE (after echoing a
 *                      message) when the site lookup does not yield
 *                      exactly one row
 */
function remove_virtual_directory($url, $path)
{
    // Escaped copies are used in SQL only; the raw $path is kept for display.
    $sqlUrl = addslashes($url);
    $sqlPath = addslashes($path);

    $result = api_sql_query("SELECT site_id FROM " . PHPDIG_DB_PREFIX .
        "sites WHERE site_url = '$sqlUrl'", __FILE__, __LINE__); // find site

    // A failed query (falsy result) is treated like "site not found".
    if (!$result || mysql_num_rows($result) != 1)
    {
        echo 'Site not in PhpDig database. Indexing is not possible...';
        return FALSE;
    }

    $row = mysql_fetch_array($result);
    $site_id = (int) $row['site_id'];

    $result = api_sql_query("SELECT spider_id FROM " . PHPDIG_DB_PREFIX .
        "spider WHERE site_id=" . $site_id . " AND path = '$sqlPath'",
        __FILE__, __LINE__); // find all pages in virtual directory

    // Per-page counts of deleted keyword references, e.g. " +12 +7".
    // Must start as an empty string: it is appended to and echoed below.
    $aff = '';
    while ($result && ($row = mysql_fetch_array($result)))
    {
        api_sql_query("DELETE FROM " . PHPDIG_DB_PREFIX .
            "engine WHERE spider_id=" . (int) $row['spider_id'],
            __FILE__, __LINE__); // delete all references to keywords
        $aff .= ' +' . mysql_affected_rows();
    }

    api_sql_query("DELETE FROM " . PHPDIG_DB_PREFIX .
        "spider WHERE site_id=" . $site_id . " AND path = '$sqlPath'",
        __FILE__, __LINE__); // delete all pages in virtual directory

    echo htmlspecialchars($path), ' (site_id ',
        $site_id, '): ', mysql_affected_rows(), $aff,
        ' pages + word references removed from index.<br>';
    return $site_id;
}
/**
 * Insert one page and its keywords directly into PhpDig's tables.
 *
 * Creates a 'spider' row for the page, then for every acceptable keyword
 * looks up (or creates) its 'keywords' row and links it to the page through
 * an 'engine' row carrying the keyword's weight. One HTML table row with
 * the totals is echoed.
 *
 * A keyword is accepted when it is longer than SMALL_WORDS_SIZE, no longer
 * than MAX_WORDS_SIZE, not a key of the global $common_words, and starts
 * with a character from WORDS_CHARS_LATIN1, '#' or '$'.
 *
 * @param int    $site_id      id of an existing row in table 'sites'
 * @param string $path         value for spider.path
 * @param string $file         value for spider.file
 * @param string $first_words  value for spider.first_words
 * @param array  $keywords     keyword => weight; a non-array (e.g. nothing
 *                             found in an empty text) is treated as empty
 * @return void
 */
function index_words($site_id, $path, $file, $first_words, $keywords)
{
    global $common_words;

    if (!is_array($keywords)) $keywords = array();

    // Numeric values are cast, strings escaped, before going into SQL.
    $site_id = (int) $site_id;
    $sqlPath = addslashes($path);
    $sqlFile = addslashes($file);
    $sqlFirstWords = addslashes($first_words);

    api_sql_query("INSERT INTO " . PHPDIG_DB_PREFIX .
        "spider SET path='$sqlPath',file='$sqlFile',first_words='$sqlFirstWords'," .
        "site_id='$site_id'", __FILE__, __LINE__);
    // do not set upddate,md5,num_words,last_modified,filesize
    $spider_id = (int) mysql_insert_id();
    $new = 0;

    foreach ($keywords as $key => $w)
    {
        // PHP turns numeric-looking array keys into integers; work on a string.
        $key = (string) $key;

        if (strlen($key) <= SMALL_WORDS_SIZE or strlen($key) > MAX_WORDS_SIZE or
            isset($common_words[$key]) or
            !ereg('^['.WORDS_CHARS_LATIN1.'#$]', $key))
        {
            continue;
        }

        $result = api_sql_query("SELECT key_id FROM " . PHPDIG_DB_PREFIX .
            "keywords WHERE keyword = '" . addslashes($key) . "'",
            __FILE__, __LINE__);
        if (mysql_num_rows($result) == 0)
        {
            // Unknown keyword: create it, with its first two characters
            // (backslashes removed) in column 'twoletters'.
            api_sql_query("INSERT INTO " . PHPDIG_DB_PREFIX .
                "keywords (keyword,twoletters) VALUES ('" . addslashes($key) .
                "','" .addslashes(substr(str_replace('\\','',$key),0,2)) ."')",
                __FILE__, __LINE__);
            $key_id = (int) mysql_insert_id();
            $new++;
        }
        else
        {
            $keyid = mysql_fetch_row($result);
            $key_id = (int) $keyid[0];
        }

        api_sql_query("INSERT INTO " . PHPDIG_DB_PREFIX .
            "engine (spider_id,key_id,weight) VALUES ($spider_id,$key_id," .
            (int) $w . ")", __FILE__, __LINE__);
    }

    // Report the raw file name, HTML-escaped (not the SQL-escaped copy).
    echo '<tr><td>', htmlspecialchars($file), '</td><td>(spider_id ', $spider_id,
        '):</td><td align="right">', count($keywords), ' kwds, ',
        $new , ' new</td></tr>', "\n";
}
/**
 * Build the text stored in spider.first_words: a title line followed by a
 * summary of the page text.
 *
 * The title is $path . $id, keeping only what follows the first 'scorm/'
 * (if present) and cutting everything from the first '&thumb' (if present).
 * The summary is $text with double spaces and line breaks turned into
 * spaces, cut to SUMMARY_DISPLAY_LENGTH bytes plus "..." when longer.
 *
 * @param string $text  page text
 * @param string $path  virtual directory of the page
 * @param string $id    file part of the page address
 * @return string       title, a newline, then the summary
 */
function get_first_words($text, $path, $id)
{
    // Title line.
    $title = $path . $id;
    $scormAt = strpos($title, 'scorm/');
    if ($scormAt !== FALSE)
    {
        $title = substr($title, $scormAt + strlen('scorm/'));
    }
    $thumbAt = strpos($title, '&thumb');
    if ($thumbAt !== FALSE)
    {
        $title = substr($title, 0, $thumbAt);
    }

    // Summary line.
    $summary = preg_replace("/([ ]{2}|\n|\r|\r\n)/", " ", $text);
    if (strlen($summary) > SUMMARY_DISPLAY_LENGTH)
    {
        $summary = substr($summary, 0, SUMMARY_DISPLAY_LENGTH) . "...";
    }

    return $title . "\n" . $summary;
}
/**
 * Count how often each word occurs in a text.
 *
 * The text is first normalised by phpdigEpureText() and then split on
 * single spaces.
 *
 * @param string $text  raw page text
 * @return array        word => number of occurrences; an empty array when
 *                      the normalised text contains no words
 */
function get_keywords($text)
{
    $nbre_mots = array();

    // Compare against FALSE explicitly: a token such as "0" is falsy in PHP
    // and would otherwise end the loop before the rest of the text is read.
    $token = strtok(phpdigEpureText($text), ' ');
    while ($token !== FALSE)
    {
        $nbre_mots[$token] = isset($nbre_mots[$token]) ? $nbre_mots[$token] + 1 : 1;
        $token = strtok(' ');
    }

    return $nbre_mots;
}
/**
 * Normalise a text into a space-separated list of indexable words.
 *
 * Relies on the constants WORDS_CHARS_LATIN1 and SMALL_WORDS_SIZE and on
 * phpdigStripAccents(). The order of the substitutions below matters.
 *
 * @param string $text  raw text
 * @return string       cleaned text, trimmed, words separated by one space
 */
function phpdigEpureText($text)
{
// Lowercase, strip accents, and map the capitals Ð and Þ to ð and þ.
$text = strtr(phpdigStripAccents(strtolower($text)), 'ÐÞ', 'ðþ');
// Replace every run of characters that are neither word characters, space,
// nor one of the listed punctuation characters by a single space.
$text = ereg_replace('[^'.WORDS_CHARS_LATIN1.' \'._~@#$&%/=-]+',' ',$text); // RH: was ' \'._~@#$:&%/;,=-]+', also below
// Drop a run of punctuation that directly follows a word character when it
// is followed by the end of the text, by whitespace at the end of the text,
// or by whitespace and another word character.
$text = ereg_replace('(['.WORDS_CHARS_LATIN1.'])[\'._~@#$&%/=-]+($|[[:space:]]$|[[:space:]]['.WORDS_CHARS_LATIN1.'])','\1\2',$text);
// Remove words of 1..SMALL_WORDS_SIZE characters (text is padded with a
// space on each side so the first and last word can match too).
// The substitution is run twice: a match consumes the whitespace on both
// sides of a short word, so a short word immediately following another one
// is presumably missed by the first pass and caught by the second.
if (SMALL_WORDS_SIZE >= 1) {
$text = ereg_replace('[[:space:]][^ ]{1,'.SMALL_WORDS_SIZE.'}[[:space:]]',' ',' '.$text.' ');
$text = ereg_replace('[[:space:]][^ ]{1,'.SMALL_WORDS_SIZE.'}[[:space:]]',' ',' '.$text.' ');
}
//$text = ereg_replace('\.+[[:space:]]|\.+$|\.{2,}',' ',$text);
// Runs of two or more dots become a space.
$text = ereg_replace('\.{2,}',' ',$text);
// Dots at the very start of the text (after optional whitespace) become a space.
$text = ereg_replace('^[[:space:]]*\.+',' ',$text);
// Collapse all whitespace runs to one space and trim both ends.
return trim(ereg_replace("[[:space:]]+"," ",$text));
}
/**
 * Replace accented letters by their unaccented equivalents.
 *
 * The ligatures 'Æ' and 'æ' become 'ae'; every other letter listed in the
 * constant LETTERS_WITH_ACCENTS is replaced by the character at the same
 * position in the constant SAME_WITHOUT_ACCENTS. Both constants must be
 * defined before this function is called.
 *
 * @param string $chaine  text in a single-byte encoding (strtr translates
 *                        byte by byte)
 * @return string         the text without accents
 */
function phpdigStripAccents($chaine)
{
    $chaine = str_replace('Æ','ae',str_replace('æ','ae',$chaine));
    // The constants must NOT be quoted: quoted, strtr() would translate the
    // letters of the literal text 'LETTERS_WITH_ACCENTS' instead of using
    // the accent table.
    return strtr($chaine, LETTERS_WITH_ACCENTS, SAME_WITHOUT_ACCENTS);
}
// REAL WORK STARTS HERE ------------------------------------------------------>
// Expects the including script to provide $path (virtual directory) and
// $idt (array of $url => $text), as described in the comment at the top.
require($phpDigIncCn); // switch to PhpDig DB

// Load the stopwords, one per line, as keys of $common_words.
if (!isset($common_words) || !is_array($common_words)) $common_words = array();
if (file_exists($phpDigIncCw) && is_array($lines = @file($phpDigIncCw)))
    foreach ($lines as $word)
        $common_words[trim($word)] = 1;

define('SUMMARY_DISPLAY_LENGTH', 150); // insert your own values
define('SMALL_WORDS_SIZE', 2); // ...
define('MAX_WORDS_SIZE',50); // ...
define('PHPDIG_ENCODING', 'iso-8859-1'); // a restriction of this API
define('WORDS_CHARS_LATIN1', '[:alnum:]ðþßµ');

// Build two parallel strings for strtr(): every accented letter and, at the
// same position, its unaccented replacement. The letters are taken one byte
// at a time, so this file must be saved in a single-byte encoding
// (iso-8859-1) for the table to be correct.
$letterswith = '';
$letterswithout = '';
foreach (array( 'A'=>'ÀÁÂÃÄÅ', 'a'=>'àáâãäå', 'O'=>'ÒÓÔÕÖØ', 'o'=>'òóôõöø',
                'E'=>'ÈÉÊË', 'e'=>'èéêë', 'C'=>'Ç', 'c'=>'ç', 'I'=>'ÌÍÎÏ',
                'i'=>'ìíîï', 'U'=>'ÙÚÛÜ', 'u'=>'ùúûü', 'Y'=>'Ý', 'y'=>'ÿý',
                'N'=>'Ñ', 'n'=>'ñ') as $without => $allwith)
    for ($i = 0, $n = strlen($allwith); $i < $n; $i++)
    {
        $letterswithout .= $without;
        $letterswith .= substr($allwith, $i, 1);
    }
define('LETTERS_WITH_ACCENTS', $letterswith);
define('SAME_WITHOUT_ACCENTS', $letterswithout);

// NOTE(review): pages are removed with path "$path/" but inserted below with
// path "$path" (no trailing slash) -- confirm which form spider.path should
// hold, otherwise a later re-index may not find the rows to remove.
if (strlen(LETTERS_WITH_ACCENTS) == strlen(SAME_WITHOUT_ACCENTS))
    if (($site_id = remove_virtual_directory('http://yoursite/', $path . '/')))
    {
        echo '<table>', "\n";
        foreach ($idt as $url => $text)
            index_words($site_id, $path, $url,
                get_first_words($text, $path, $url), get_keywords($text));
        echo '</table>', "\n";
    }
// possible enhancement: UPDATE spider record for still existing pages
if(isset($db)) mysql_select_db($mainDbName, $db); // back to Dokeos
// eLearning platform Dokeos, see www.dokeos.com
?>
|