書名: 自己動手寫分布式搜索引擎 | 作者名: 羅剛 | 本章字數: 294字 | 更新時間: 2020-11-28 15:52:47
3.2.11 寫索引集成到爬蟲
爬蟲把抓取的信息寫入索引:
public class IndexDao { private IndexWriter indexWriter; public IndexDao(){ try { Directory directory = FSDirectory.open(new File("d:/lietu/index")); Analyzer analyzer = new StandardAnalyzer(); indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED); } catch (IOException e) { e.printStackTrace(); } } public void save(GoodsInfo goodsInfo){ Document doc = goodsInfo2Document(goodsInfo); try{ indexWriter.addDocument(doc); }catch(Exception e){ e.printStackTrace(); } } public void close(){ try { indexWriter.close(); } catch (Exception e) { e.printStackTrace(); } } public Document goodsInfo2Document(GoodsInfo ti) { Document doc = new Document(); Field f = new Field("url", ti.getGoodsNameURL(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); f = new Field("title", ti.getGoodsName(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(f); if (ti.getGoodsDescription() ! = null) { f = new Field("body", ti.getGoodsDescription(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } f = new Field("date", DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); f = new Field("priceInt", String.valueOf(ti.getPriceInteger()), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(f); if (ti.getMoneyUnit() ! 
= null) { f = new Field("moneyUnit", ti.getMoneyUnit(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } try { URL website = new URL(ti.getGoodsNameURL().toString()); f = new Field("fromwebsite", website.getHost(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } catch (MalformedURLException e1) { System.out.println("error url =" + ti.getGoodsNameURL().toString()); e1.printStackTrace(); } // 分類 f = new Field("category", ti.getGoodsType(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); // img if (ti.getImage() ! = null) { f = new Field("img", ti.getImage(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } // 制造廠名稱 if (ti.getMfrName() ! = null) { f = new Field("brand", ti.getMfrName(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } // 商品型號 序列號 if (ti.getMfrNumber() ! = null) { f = new Field("type", ti.getMfrNumber(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } // 價格 if (ti.getGoodsPrice() ! = null) { f = new Field("price", ti.getGoodsPrice(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } return doc; } }
推薦閱讀
- 性能測試從零開始
- Beginning Swift
- Expert Cube Development with Microsoft SQL Server 2008 Analysis Services
- 新編AutoCAD 2016從入門到精通
- UG NX 12.0中文版從入門到精通
- AutoCAD 2022中文版完全自學一本通
- iPad Procreate風格繪畫之美
- NX Open API編程技術
- 從零開始:Indesign CC 2019設計基礎+商業設計實戰
- KNIME視覺化數據分析
- Photoshop & Illustrator平面設計火星課堂
- Photoshop后期強:多重曝光專業技法寶典
- Joomla! 1.5 JavaScript jQuery
- Photoshop攝影后期必修5項核心技法 基本調整+光影校正+色彩修飾+局部處理+銳化降噪
- OpenCms 7 Development