关于搜索引擎的一些源代码--688IT编程网

关于搜索引擎的一些源代码

//第一个版本应该保存body和title，搜索结果形成超链接，不显示正文。

/// <summary>
/// Builds (or updates) the Lucene index stored under c:/index.
/// For every topic number from 1 to <c>GetMaxId()</c> the page is downloaded,
/// parsed for its title and body text, and re-indexed under its number.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    string indexPath = "c:/index"; // folder in which the index files are stored

    // Directory is where Lucene keeps the indexed data. It is abstract, with two
    // subclasses: FSDirectory (on disk) and RAMDirectory (in memory).
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        // IndexReader.IndexExists tells whether the directory already holds an index.
        bool isUpdate = IndexReader.IndexExists(directory);
        if (isUpdate)
        {
            // If the index directory is still locked (e.g. the program crashed while
            // indexing), unlock it first. An IndexWriter locks the directory while
            // writing and releases the lock on Close; two writers cannot write to the
            // same index at once.
            if (IndexWriter.IsLocked(directory))
            {
                IndexWriter.Unlock(directory);
            }
        }

        // Create a new index only when none exists yet (create == !isUpdate).
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            using (WebClient wc = new WebClient())
            {
                wc.Encoding = Encoding.UTF8; // otherwise the downloaded text is garbled

                // todo: read the RSS feed; the number in the first item's link is the
                // highest topic number.
                int maxId = GetMaxId();

                for (int i = 1; i <= maxId; i++)
                {
                    // NOTE(review): the base address of this URL appears to have been
                    // lost from the source; restore it before running.
                    string url = "-" + i + ".aspx";

                    // Download the page for this topic number.
                    string html = wc.DownloadString(url);

                    // Parse the downloaded HTML.
                    HTMLDocumentClass doc = new HTMLDocumentClass();
                    doc.designMode = "on"; // keep the parsing engine from trying to run javascript
                    doc.IHTMLDocument2_write(html);
                    doc.close();

                    string title = doc.title;
                    string body = doc.body.innerText; // text with the tags stripped

                    // To avoid indexing a page twice, delete any record with number == i
                    // before adding it again.
                    writer.DeleteDocuments(new Term("number", i.ToString()));

                    Document document = new Document();

                    // Only fields that need full-text search are ANALYZED.
                    document.Add(new Field("number", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    document.Add(new Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    document.Add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));

                    writer.AddDocument(document);

                    logger.Debug("索引" + i + "完毕");
                }
            }
        }
        finally
        {
            // Always close the writer so the write lock is released even when a
            // download or parse step throws.
            writer.Close();
        }
    }
    finally
    {
        // Do not forget to Close, otherwise the indexed results cannot be found.
        directory.Close();
    }

    logger.Debug("全部索引完毕");
}

/// <summary>
/// Searches the index under c:/index for the keywords typed into <c>TextBox1</c>
/// and binds the matching documents (number, title, highlighted body preview)
/// to <c>repeaterResult</c>.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    string indexPath = "c:/index";
    string kw = TextBox1.Text;

    List<SearchResult> listResult = new List<SearchResult>();

    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try
    {
        reader = IndexReader.Open(directory, true); // second argument: open read-only
        searcher = new IndexSearcher(reader);

        PhraseQuery query = new PhraseQuery();

        // todo: split the user's keywords into words.
        // For now the user separates the words with spaces, e.g. "计算机 专业";
        // presumably CommonHelper.SplitWord returns one entry per word.
        foreach (string word in CommonHelper.SplitWord(kw))
        {
            query.Add(new Term("body", word));
        }

        // Loosen the phrase match so the words do not have to be adjacent.
        query.SetSlop(100);

        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);

        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

        for (int i = 0; i < docs.Length; i++)
        {
            // Document number (the key Lucene itself assigned).
            int docId = docs[i].doc;

            // The hits carry only document ids; the Document has to be fetched
            // separately, which keeps memory usage down.
            Document doc = searcher.Doc(docId);

            SearchResult result = new SearchResult();
            result.Number = doc.Get("number");
            result.Title = doc.Get("title");
            result.BodyPreview = Preview(doc.Get("body"), kw);

            listResult.Add(result);
        }
    }
    finally
    {
        // Release the index handles whether or not the search succeeded.
        if (searcher != null)
        {
            searcher.Close();
        }
        if (reader != null)
        {
            reader.Close();
        }
        directory.Close();
    }

    repeaterResult.DataSource = listResult;
    repeaterResult.DataBind();
}

/// <summary>
/// Returns the fragment of <paramref name="body"/> chosen by the PanGu highlighter
/// for <paramref name="keyword"/>, with highlighted words wrapped in a red font tag.
/// </summary>
/// <param name="body">Full text to take the preview from.</param>
/// <param name="keyword">Keyword text passed to the highlighter.</param>
/// <returns>The result of <c>Highlighter.GetBestFragment</c>.</returns>
private static string Preview(string body,string keyword)
{
    // The formatter supplies the prefix/suffix placed around each highlighted word;
    // the highlighter is built from that formatter and a PanGu Segment.
    PanGu.HighLight.Highlighter fragmentPicker = new PanGu.HighLight.Highlighter(
        new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>"),
        new Segment());

    // Number of characters per summary fragment.
    fragmentPicker.FragmentSize = 100;

    // Best-matching summary fragment.
    return fragmentPicker.GetBestFragment(keyword, body);
}

private int GetMaxId()

{

XDocument xdoc = XDocument.Load("");

XElement channel = xdoc.Root.Element("channel");

XElement firstItem = channel.Elements("item").First();

XElement link = firstItem.Element("link");

Match match =

Regex.Match(link.Value, @"showtopic-(\d+)\.aspx");

string id =match.Groups[1].Value;

688IT编程网

关于搜索引擎的一些源代码

发表评论

推荐文章

java正则表达式选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

热门文章

excel文字递增函数公式

数字递增公式

notepad 正则变量运算

C++regex库常用函数及实例

js正则表达式之前瞻后顾与非捕获分组

indesign正则数字和英文之间的空格

C#匹配中文字符串的4种正则表达式分享

PHP正则表达式匹配中文字符

匹配中文汉字的正则表达式介绍

Python正则表达式如何进行字符串替换

orcl中用正则表达式

sql正则表达式excel

dataframe正则表达式

postgress sql正则

el-upload accept 正则表达式

半小时正则表达式

判断科学计数法的正则

根据url判断静态资源的方法

Java正则表达式-匹配正负浮点数

替换模糊匹配正则-hive

最新文章

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

能被5整除的十进制整数的正规表达式

大于0小于等于1的正则表达式

linux grep 26个字母

java pattern 正则表达式

掌握文本编辑器中的搜索和替换技巧

标签列表

688IT编程网

关于搜索引擎的一些源代码

发表评论

推荐文章

java正则表达式 选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

热门文章

excel文字递增函数公式

数字递增公式

notepad 正则变量运算

C++regex库常用函数及实例

js正则表达式之前瞻后顾与非捕获分组

indesign正则数字和英文之间的空格

C#匹配中文字符串的4种正则表达式分享

PHP正则表达式匹配中文字符

匹配中文汉字的正则表达式介绍

Python正则表达式如何进行字符串替换

orcl中用正则表达式

sql正则表达式excel

dataframe正则表达式

postgress sql正则

el-upload accept 正则表达式

半小时 正则表达式

判断科学计数法的正则

根据url判断静态资源的方法

Java正则表达式-匹配正负浮点数

替换模糊匹配正则-hive

最新文章

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

能被5整除的十进制整数的正规表达式

大于0小于等于1的正则表达式

linux grep 26个字母

java pattern 正则表达式

掌握文本编辑器中的搜索和替换技巧

标签列表

java正则表达式选择题

非零金额正则表达式

半小时正则表达式