How can I index files on my local filesystem? (VB.NET)


Search Lite for ASP.NET is designed primarily to be a web-site indexer, and includes a web-site spider/crawler that visits web-sites from a user's perspective.  This product can also search documents on local drives using the file-system.  To do this, we need to programmatically add documents to the index, and then build the index as usual.
1) You will scan directories to identify files that should be indexed

2) You'll programmatically add these files to the index

3) You'll build the index programmatically and then you can run searches (programmatically or from the web control)

In the following code we reference our demo project "Programmatic_VB.NET", which is installed with the main product; it gives you a simple framework within which to run builds and searches.

Now go through 1, 2 and 3 from above:

1. Scanning directories for documents to add. In this code we will search for all PDF and Word docs.

i) add a button to the form in the demo and attach an event handler for its Click event

ii) add this method

' Recursively collects the paths of PDF and Word documents found under a directory.
'
' theFoundItems - list that receives the path of every matching file.
' theRoot       - directory to scan; every sub-directory below it is scanned as well.
'
' Exceptions raised by System.IO.Directory (for example for a missing or inaccessible
' directory) are not caught here and propagate to the caller.
Public Sub FindDocs(ByVal theFoundItems As ArrayList, ByVal theRoot As String)

    ' Descend into each sub-directory first.
    Dim theDirectories() As String = System.IO.Directory.GetDirectories(theRoot)
    For Each theDirectory As String In theDirectories
        FindDocs(theFoundItems, theDirectory)
    Next

    ' Now scan the files in the current directory for any that match...
    Dim theFiles() As String = System.IO.Directory.GetFiles(theRoot)

    For Each theFile As String In theFiles
        ' Test the extension only. The entries include the directory part, so a substring
        ' test on the whole path would also match files that merely sit inside a directory
        ' whose name contains ".doc" or ".pdf".
        Dim theExtension As String = System.IO.Path.GetExtension(theFile).ToLower()

        ' StartsWith keeps .docx/.docm matching, as the original substring test did.
        If theExtension = ".pdf" OrElse theExtension.StartsWith(".doc") Then    'PDF & DOC FILE FILTER
            theFoundItems.Add(theFile)
        End If
    Next

End Sub


and use it from your button click

' Click handler for Button1: gathers the paths of the documents found beneath the web root.
Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
    Dim rootFolder As String = "C:\Inetpub\wwwroot"
    Dim docs As ArrayList = New ArrayList()

    ' Once this returns, docs holds the paths to all Doc and PDF files under wwwroot
    ' (careful, the scan can take a long time).
    FindDocs(docs, rootFolder)
End Sub

2. Add the docs to the index by modifying the Button1_Click to include

' Click handler for Button1: finds the documents under the web root and adds each one
' to the index. Any exception raised while adding is reported in a message box.
Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
    Dim docs As New ArrayList
    FindDocs(docs, "C:\Inetpub\wwwroot")
    'docs now has paths to all Doc and PDF files under wwwroot (careful, this method can take a long time)

    'add the docs to the index
    Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

    Try
        Dim doc As String
        For Each doc In docs
            indexer.AddDocument(New Keyoti.SearchEngine.Documents.Document(New Keyoti.SearchEngine.DataAccess.DocumentRecord(New Uri(doc))))
        Next doc
    Catch ex As Exception
        MessageBox.Show("An error occurred: " & ex.Message)
    End Try

    MessageBox.Show("Recursive find ended")

    'call UpdateKnownDocs() to refresh list of docs in index
End Sub

3. Now the index can be built. In the demo this is already taken care of by the Build Index button, which just calls

' Runs an index build against the index directory, reports any exception in a
' message box, then marks the build as finished and restores the build buttons.
Sub RunBuild()
    builderFinished = False
    Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

    Try
        ' TODO(review): the statement that actually starts the build is missing from this
        ' listing. It is presumably a build call on the indexer field - restore it from the
        ' installed "Programmatic_VB.NET" demo project.
    Catch e As Exception
        MessageBox.Show("An error occurred: " & e.Message)
    End Try

    builderFinished = True
    Me.stopBuildBT.Enabled = False
    Me.indexBT.Enabled = True
    MessageBox.Show("Build ended")
End Sub

That will add all the docs/pdfs it can find to the index. Once you've built the index you can run searches programmatically (see demo) or using the SearchResult control, by pointing its IndexDirectory property at the directory where you built this index.

Note: recursively searching an entire hard drive can take time, so limit the search paths as much as possible.
Complete modified code from demo project (just replace Form1 in the demo with this):

' Main form of the programmatic demo. It crawls a web site, adds local PDF/Word
' documents to the index, builds or deletes the index and runs searches against it.
'
' NOTE(review): the control declarations used below (crawlBT, indexBT, Button1, ...)
' are not part of this listing; presumably they come from the designer-generated
' code of the demo project - confirm before replacing Form1.
Public Class Form1

    Inherits System.Windows.Forms.Form

    ' Crawler and index objects shared by the handlers and worker threads below.
    Dim spider As New Keyoti.SearchEngine.Index.WebSiteSpider
    Dim indexer As New Keyoti.SearchEngine.Index.DocumentIndex

    ' Assigned to Configuration.xmlLocation before every index operation.
    Dim indexDir As String = "..\IndexDirectory"

    ' Set by the worker threads so that the progress threads know when to stop.
    Dim crawlerFinished, builderFinished As Boolean

#Region "Crawl"

    ' Starts the crawl on a worker thread plus a second thread that reports progress.
    Private Sub crawlBT_Click(ByVal sender As Object, ByVal e As System.EventArgs) Handles crawlBT.Click

        ' Reset the flag before either thread runs; otherwise, on a second crawl, the
        ' progress thread could still see True from the previous run and exit at once.
        crawlerFinished = False

        'start the crawl thread
        Dim crawlThread As New System.Threading.Thread(New System.Threading.ThreadStart(AddressOf RunCrawl))
        crawlThread.Start()

        'start the progress thread
        Dim crawlProgressThread As New System.Threading.Thread(New System.Threading.ThreadStart(AddressOf RunCrawlUpdater))
        crawlProgressThread.Start()

        Me.stopCrawlBT.Enabled = True
        Me.crawlBT.Enabled = False

    End Sub 'crawlBT_Click

    ' Crawls the URL entered in crawlURLTB, reports any exception in a message box,
    ' then marks the crawl as finished and restores the crawl buttons.
    Sub RunCrawl()

        crawlerFinished = False
        Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

        Try
            spider.Crawl(New ArrayList(New String() {Me.crawlURLTB.Text}))
        Catch e As Exception
            MessageBox.Show("An error occurred: " & e.Message)
        End Try

        crawlerFinished = True
        Me.stopCrawlBT.Enabled = False
        Me.crawlBT.Enabled = True
        MessageBox.Show("Crawl ended")

    End Sub 'RunCrawl

    'updates the UI with crawl progress
    Sub RunCrawlUpdater()

        While Not crawlerFinished
            ' Pause between refreshes so that this loop does not spin a CPU core.
            System.Threading.Thread.Sleep(500)
            Me.crawlProgress.Text = spider.NewLinkNo & " new of " & spider.ProcessedLinkNo & " links"
        End While

    End Sub 'RunCrawlUpdater

    'stops the crawl
    Private Sub stopCrawlBT_Click(ByVal sender As Object, ByVal e As System.EventArgs) Handles stopCrawlBT.Click

        Keyoti.SearchEngine.Index.WebSiteSpider.cancelCrawl = True

    End Sub 'stopCrawlBT_Click

#End Region

#Region "Delete"

    ' Disables the delete button and runs the deletion on a worker thread.
    Private Sub deleteIndexBT_Click(ByVal sender As Object, ByVal e As System.EventArgs) Handles deleteIndexBT.Click

        Me.deleteIndexBT.Enabled = False
        Me.deleteIndexBT.Text = "Deleting..."

        Dim delThread As New System.Threading.Thread(New System.Threading.ThreadStart(AddressOf RunDelete))
        delThread.Start()

    End Sub 'deleteIndexBT_Click

    ' Walks every document currently in the index, then restores the delete button.
    Sub RunDelete()

        Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

        Dim ind As New Keyoti.SearchEngine.Index.DocumentIndex
        Dim docs As ArrayList = ind.GetIndexedDocuments()
        Dim doc As Keyoti.SearchEngine.Documents.Document

        For Each doc In docs
            ' TODO(review): the statement that removes doc from the index is missing from
            ' this listing - restore it from the installed demo project.
        Next doc

        Me.deleteIndexBT.Enabled = True
        Me.deleteIndexBT.Text = "Delete"
        MessageBox.Show("Index deleted")

    End Sub 'RunDelete

#End Region

#Region "Build Index"

    ' Starts the build on a worker thread plus a second thread that reports progress.
    Private Sub indexBT_Click(ByVal sender As Object, ByVal e As System.EventArgs) Handles indexBT.Click

        ' Reset the flag before either thread runs; otherwise, on a second build, the
        ' progress thread could still see True from the previous run and exit at once.
        builderFinished = False

        'start the build thread
        Dim buildThread As New System.Threading.Thread(New System.Threading.ThreadStart(AddressOf RunBuild))
        buildThread.Start()

        'start the progress thread
        Dim buildProgressThread As New System.Threading.Thread(New System.Threading.ThreadStart(AddressOf RunBuildUpdater))
        buildProgressThread.Start()

        Me.stopBuildBT.Enabled = True
        Me.indexBT.Enabled = False

    End Sub 'indexBT_Click

    ' Runs an index build, reports any exception in a message box, then marks the
    ' build as finished and restores the build buttons.
    Sub RunBuild()

        builderFinished = False
        Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

        Try
            ' TODO(review): the statement that actually starts the build is missing from
            ' this listing. It is presumably a build call on the indexer field - restore
            ' it from the installed demo project.
        Catch e As Exception
            MessageBox.Show("An error occurred: " & e.Message)
        End Try

        builderFinished = True
        Me.stopBuildBT.Enabled = False
        Me.indexBT.Enabled = True
        MessageBox.Show("Build ended")

    End Sub 'RunBuild

    'updates the UI with build progress
    Sub RunBuildUpdater()

        While Not builderFinished
            ' Pause between refreshes so that this loop does not spin a CPU core.
            System.Threading.Thread.Sleep(500)

            If indexer.Progress < 100 Then
                buildProgress.Text = indexer.Progress & "% built"
            Else
                ' Progress has reached 100 but RunBuild has not yet set builderFinished.
                buildProgress.Text = indexer.OccurrenceCount & " indexed - saving..."
            End If
        End While

        buildProgress.Text = "Finished"

    End Sub 'RunBuildUpdater

    'stops the build
    Private Sub stopBuildBT_Click(ByVal sender As Object, ByVal e As System.EventArgs) Handles stopBuildBT.Click

        Keyoti.SearchEngine.Index.DocumentIndex.cancelBuild = True

    End Sub 'stopBuildBT_Click

#End Region

#Region "Search"

    ' Runs the query typed into queryTB and lists the URI of each result in resultsTB.
    Private Sub searchBT_Click(ByVal sender As Object, ByVal e As System.EventArgs) Handles searchBT.Click

        Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

        Try
            Dim resultsOutput As String = ""

            Dim sa As New Keyoti.SearchEngine.Search.SearchAgent
            sa.QueryString = Me.queryTB.Text
            sa.LicenseKey = Me.licenseTB.Text

            'get first 50 results only
            Dim results As Keyoti.SearchEngine.Search.SearchResult = sa.Search(1, 50)

            Dim item As Keyoti.SearchEngine.Search.ResultItem
            For Each item In results
                ' One URI per entry, separated by a blank line.
                resultsOutput &= item.URIString & ControlChars.CrLf & ControlChars.CrLf
            Next item

            If results.Count = 0 Then
                resultsOutput = "No results found"
            End If

            Me.resultsTB.Text = resultsOutput
        Catch ex As Exception
            MessageBox.Show("An error occurred: " & ex.Message)
        End Try

    End Sub 'searchBT_Click

#End Region

#Region "E.t.c."

    ' Switches the search engine's logging setting to match the check box.
    Private Sub checkBox1_CheckedChanged(ByVal sender As Object, ByVal e As System.EventArgs) Handles checkBox1.CheckedChanged

        Keyoti.SearchEngine.Configuration.logging = Me.checkBox1.Checked

    End Sub 'checkBox1_CheckedChanged

    ' Lists the URI of every document currently in the index in knownDocsTB.
    Sub UpdateKnownDocs()

        Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

        Try
            Dim output As String = ""

            Dim docs As ArrayList = indexer.GetIndexedDocuments()
            Dim doc As Keyoti.SearchEngine.Documents.Document

            For Each doc In docs
                ' One URI per entry, separated by a blank line.
                output &= doc.URI.ToString() & ControlChars.CrLf & ControlChars.CrLf
            Next doc

            If docs.Count = 0 Then
                output = "No documents in index"
            End If

            Me.knownDocsTB.Text = output
        Catch e As Exception
            MessageBox.Show("An error occurred: " & e.Message)
        End Try

    End Sub 'UpdateKnownDocs

#End Region

    ' Form load handler.
    ' NOTE(review): the body is empty in this listing; presumably the demo refreshes the
    ' known-documents list here (UpdateKnownDocs) - confirm against the demo project.
    Sub on_load(ByVal sender As Object, ByVal e As System.EventArgs) Handles MyBase.Load

    End Sub

    ' Application entry point. The attribute's line continuation must be followed
    ' directly by the declaration it applies to, so no blank line may separate them.
    <STAThread()> _
    Shared Sub Main()

        Application.Run(New Form1)

    End Sub

    ' Recursively collects the paths of PDF and Word documents found under a directory.
    '
    ' theFoundItems - list that receives the path of every matching file.
    ' theRoot       - directory to scan; every sub-directory below it is scanned as well.
    '
    ' Exceptions raised by System.IO.Directory (for example for a missing or inaccessible
    ' directory) are not caught here and propagate to the caller.
    Public Sub FindDocs(ByVal theFoundItems As ArrayList, ByVal theRoot As String)

        ' Descend into each sub-directory first.
        Dim theDirectories() As String = System.IO.Directory.GetDirectories(theRoot)

        For Each theDirectory As String In theDirectories
            FindDocs(theFoundItems, theDirectory)
        Next

        ' Now scan the files in the current directory for any that match...
        Dim theFiles() As String = System.IO.Directory.GetFiles(theRoot)

        For Each theFile As String In theFiles
            ' Test the extension only. The entries include the directory part, so a
            ' substring test on the whole path would also match files that merely sit
            ' inside a directory whose name contains ".doc" or ".pdf".
            Dim theExtension As String = System.IO.Path.GetExtension(theFile).ToLower()

            ' StartsWith keeps .docx/.docm matching, as the original substring test did.
            If theExtension = ".pdf" OrElse theExtension.StartsWith(".doc") Then
                theFoundItems.Add(theFile)
            End If
        Next

    End Sub

    ' Click handler for Button1: finds the documents under the web root and adds each
    ' one to the index. Any exception raised while adding is reported in a message box.
    Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click

        Dim docs As New ArrayList

        FindDocs(docs, "C:\Inetpub\wwwroot")
        'docs now has paths to all Doc and PDF files under wwwroot (careful, this method can take a long time)

        'add the docs to the index
        Keyoti.SearchEngine.Configuration.xmlLocation = indexDir

        Try
            Dim doc As String

            For Each doc In docs
                indexer.AddDocument(New Keyoti.SearchEngine.Documents.Document(New Keyoti.SearchEngine.DataAccess.DocumentRecord(New Uri(doc))))
            Next doc
        Catch ex As Exception
            MessageBox.Show("An error occurred: " & ex.Message)
        End Try

        MessageBox.Show("Recursive find ended")

        'call UpdateKnownDocs() to refresh list of docs in index

    End Sub

End Class

