Hey Jim,
I think I may have found a small bug.
We are using some of the code found on here (and altered it) that allows us to index Meta Keywords and descriptions. When inserting the meta data, we have site specific variables that allow the client to adjust the weighting of the meta data.
Everything works perfectly as long as we leave the BoostFactorTagName at the Keyoti default, "keyoti_search_weight_boost_factor". When I look at the index, the meta words are boosted by the correct factor.
Once we change the default (in configuration.xml) and insert the new tag at run-time, the meta data is no longer boosted (it falls back to the default of 10*1).
Please let me know how to resolve the issue!
Cheers,
-Dan
Code below altered as follows:
- Constructor passes in 2 new values
- metaKeyWordWeight --> int for boost factor
- metaDescriptionWeight --> int for boost factor
- reference KeyotiVariables.BoostFactorTagName for the reference to the configuration BoostFactorTagName
Code:
/// <summary>
/// HTML document parser that appends the contents of the "keywords" and
/// "description" meta tags to the document body before delegating to the base
/// parser. Each appended section is wrapped in HTML comment markers of the form
/// <c>&lt;!--TAGNAME="N"--&gt;</c>, where TAGNAME is
/// <c>KeyotiVariables.BoostFactorTagName</c> and N is the weight passed to the
/// constructor.
/// </summary>
/// <remarks>
/// NOTE(review): the markers are presumably interpreted by the base parser as
/// weight-boost instructions; that behavior lives in the Keyoti library and is
/// not visible in this file.
/// </remarks>
public class ExtendedHtmlDocumentParser : Keyoti.SearchEngine.Documents.HtmlDocumentParser
{
    // Format templates. These are constants on purpose: the formatted results
    // are stored per instance, so building one parser can never affect the
    // marks used by another (previously the static templates were overwritten
    // by the first constructor call, and every later instance silently reused
    // the first instance's tag name and weights).
    private const string StartMarkTemplate = " <!--{0}=\"{1}\"--> ";
    private const string EndMarkTemplate = " <!--{0}=\"1\"--> ";

    // Captures the name (group 1) and content (group 2) of a <meta> tag whose
    // name attribute precedes its content attribute.
    private static readonly Regex MetaTagRegex = new Regex(
        "<\\s*meta[^>]*name=[\"']?([^\"'\\s]*)[^>]*content=[\"']?([^\"']*)",
        RegexOptions.IgnoreCase);

    private readonly string keywordStartMark;
    private readonly string descStartMark;
    private readonly string endMark;

    /// <summary>
    /// Creates a parser and builds the boost markers for this instance.
    /// </summary>
    /// <param name="c">Configuration passed through to the base parser.</param>
    /// <param name="metaKeyWordWeight">Weight written into the marker that precedes the meta keywords.</param>
    /// <param name="metaDescriptionWeight">Weight written into the marker that precedes the meta description.</param>
    public ExtendedHtmlDocumentParser(Configuration c, int metaKeyWordWeight, int metaDescriptionWeight) : base(c)
    {
        // Read the tag name at construction time so each instance reflects the
        // value that is current when it is created.
        string tagName = KeyotiVariables.BoostFactorTagName;

        // Invariant culture keeps the numeric weight in plain ASCII form
        // regardless of the thread's current culture.
        IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

        keywordStartMark = string.Format(invariant, StartMarkTemplate, tagName, metaKeyWordWeight);
        descStartMark = string.Format(invariant, StartMarkTemplate, tagName, metaDescriptionWeight);
        endMark = string.Format(invariant, EndMarkTemplate, tagName);
    }

    /// <summary>
    /// Reads the whole document, appends the meta keywords and description
    /// (each wrapped in this instance's boost markers), and passes the modified
    /// document to the base parser.
    /// </summary>
    /// <param name="stream">Source document stream; it is fully consumed and closed.</param>
    /// <param name="uri">Document URI, passed through to the base parser.</param>
    /// <param name="encoding">
    /// Encoding passed through to the base parser. It is also used to decode the
    /// source and re-encode the modified copy; UTF-8 is used for that when null.
    /// </param>
    /// <returns>The result of the base parser run over the modified document.</returns>
    public override DocumentText Read(System.IO.Stream stream, Uri uri, Encoding encoding)
    {
        // Decode and re-encode with the same encoding that is handed to the
        // base parser. Previously both steps were hard-wired to UTF-8, which
        // corrupted non-ASCII text whenever a different encoding was supplied.
        Encoding textEncoding = encoding ?? Encoding.UTF8;

        // To read the meta tags, we need a copy of the document.
        string documentContent;
        using (MemoryStream peekStream = CopyStream(stream))
        using (StreamReader peekReader = new StreamReader(peekStream, textEncoding, true))
        {
            documentContent = peekReader.ReadToEnd();
        }

        Hashtable metaTable = ReadMetaTags(documentContent);

        // Build the modified document: original content, a separating space,
        // then the boosted meta sections.
        StringBuilder modified = new StringBuilder(documentContent);
        modified.Append(' ');
        AppendBoostedMeta(modified, keywordStartMark, "Meta keywords:", metaTable["keywords"] as string);
        AppendBoostedMeta(modified, descStartMark, "Meta description:", metaTable["description"] as string);

        // GetBytes emits no byte-order mark, so the stream starts directly
        // with the document text.
        MemoryStream modifiedStream = new MemoryStream(textEncoding.GetBytes(modified.ToString()));

        // And do the usual parsing based on the new stream.
        return base.Read(modifiedStream, uri, encoding);
    }

    /// <summary>
    /// Appends one meta value wrapped in its start and end markers and writes a
    /// log entry for it. Does nothing when the value is null.
    /// </summary>
    private void AppendBoostedMeta(StringBuilder target, string startMark, string logPrefix, string value)
    {
        if (value == null)
        {
            return;
        }

        target.Append(startMark);
        target.Append(value);
        target.Append(endMark);
        Keyoti.SearchEngine.DataAccess.Log.WriteLogEntry("Plug-in", logPrefix + value, Configuration);
    }

    /// <summary>
    /// Copies the remaining bytes of <paramref name="stream"/> into a new
    /// memory stream positioned at its start, then closes the source stream.
    /// </summary>
    private MemoryStream CopyStream(Stream stream)
    {
        MemoryStream memStream = new MemoryStream(8192);
        byte[] buf = new byte[4096];
        int read;
        while ((read = stream.Read(buf, 0, buf.Length)) > 0)
        {
            memStream.Write(buf, 0, read);
        }
        memStream.Position = 0;
        stream.Close();
        return memStream;
    }

    /// <summary>
    /// Extracts meta tag name/content pairs from the document text.
    /// </summary>
    /// <param name="documentBody">Full document text.</param>
    /// <returns>
    /// Table keyed by lower-cased meta name; when a name occurs more than once
    /// the first occurrence is kept.
    /// </returns>
    private Hashtable ReadMetaTags(string documentBody)
    {
        Hashtable metaTable = new Hashtable();
        foreach (Match m in MetaTagRegex.Matches(documentBody))
        {
            // Invariant lower-casing so lookups such as "description" work
            // under every culture (e.g. Turkish dotless-i casing rules).
            string key = m.Groups[1].Value.ToLowerInvariant();
            if (!metaTable.ContainsKey(key))
            {
                metaTable.Add(key, m.Groups[2].Value);
            }
        }
        return metaTable;
    }
}