solr全文检索一

发表: 2016-12-14 浏览: 1869

1.环境配置

jdk1.8

solr6.3.0

apache-tomcat-8.0.38

windows OS

2.server的jetty启动方式

cd D:\soft\worksoft\solr-6.3.0\

D:\soft\worksoft\solr-6.3.0>bin\solr.cmd -p 8983

Waiting up to 30 to see Solr running on port 8983

Started Solr server on port 8983. Happy searching!

web访问确认：

Clipboard Image.png

关闭服务：

D:\soft\worksoft\solr-6.3.0>bin\solr.cmd stop -p 8983

Stopping Solr process 4376 running on port 8983

3.配置core

1.拷贝solr-6.3.0\example\example-DIH\solr\solr到solr-6.3.0\server\solr,并把solr重命名为hbase。说明：如果是数据库类型，可以拷贝solr-6.3.0\example\example-DIH\solr\db目录

2.managed-schema.xml内容

<?xml version="1.0" encoding="UTF-8" ?>



<schema name="hbaseEntity-solr" version="1.6">

  

	<!-- Reserved fields: do not delete them, otherwise Solr reports an error. -->
	<field name="_version_" type="long" indexed="true" stored="true"/>
	<field name="_root_" type="string" indexed="true" stored="false"/>

	<!--
		Primitive field types referenced by type="long" and type="string" in this schema.
		Every type named by a <field> has to be declared as a <fieldType>; without these
		two declarations the schema cannot be loaded because the types are unknown.
	-->
	<fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
	<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>



	<!--
		Field attributes:
		  name:    field name
		  type:    one of the fieldType definitions of this schema
		  indexed: whether the field is indexed
		  stored:  whether the field is stored
	-->
	<!--
		jobId uses the custom mmseg4j tokenized type.
		NOTE(review): jobId is also declared as the uniqueKey of this schema; the Solr
		documentation advises a non-analyzed type for the uniqueKey. Confirm this is intended.
	-->
	<field name="jobId" type="mmseg4j_simple" indexed="true" stored="true" required="true" multiValued="false"/>
	<field name="submitted" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
	<field name="duration" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
	<field name="stages" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
	<field name="tasks" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
	<field name="description" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
	<field name="url" type="string" indexed="true" stored="true" required="true" multiValued="false"/>



	<!-- Catch-all field: the copyField rules gather the full text here so it can be searched in one place. -->
	<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>



	<!-- Dynamic fields (fields without a fixed name): none are declared here. -->

	<!-- Unique identifier of a document. -->
	<uniqueKey>jobId</uniqueKey>



	<!-- Source fields whose content is copied into the "text" field. -->
	<copyField source="submitted" dest="text"/>
	<copyField source="url" dest="text"/>





	<!--fieldType 定义 -->



	<!--
		Custom IK analyzer field type.
		Download: https://code.google.com/p/ik-analyzer/downloads/list
		(choose the package that matches your Solr version).
	-->
	<fieldType name="text_ik" class="solr.TextField">
		<analyzer type="index">
			<!--
				IKTokenizerFactory: extends TokenizerFactory.
				useSmart: whether smart segmentation is enabled.
			-->
			<tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="false"/>
			<!-- StopFilterFactory: drops the stop words configured in stopwords.txt. -->
			<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="true"/>
			<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
		</analyzer>
	</fieldType>
	<!-- Disabled alternative that configures IKAnalyzer directly:
	<fieldType name="text_ik" class="solr.TextField" positionIncrementGap="100">
	  <analyzer type="index" isMaxWordLength="false" class="org.wltea.analyzer.lucene.IKAnalyzer" />
	  <analyzer type="query" isMaxWordLength="true" class="org.wltea.analyzer.lucene.IKAnalyzer" />
	</fieldType>
	-->



	<!--
		mmseg4j tokenizer field types.
		Download mmseg4j-1.9.1.zip from https://code.google.com/p/mmseg4j/downloads/list and unpack it.
		For versions before mmseg4j-1.9.0, copy the data directory into solr_home/solr
		(at the same level as the core) and rename it to dic.
		mmseg4j-core-1.9.0.jar already contains words.dic, so dicPath may be left out.
		NOTE(review): dicPath is a relative path here, while the article's "exception 2" reports
		that only an absolute path worked in this setup; confirm which value is intended.
	-->
	<fieldType name="mmseg4j_complex" class="solr.TextField" positionIncrementGap="100">
		<analyzer>
			<tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="complex" dicPath="dic"/>
		</analyzer>
	</fieldType>

	<!-- mmseg4j tokenizer in max-word mode, reading its dictionary from dicPath. -->
	<fieldType name="mmseg4j_max_word" class="solr.TextField" positionIncrementGap="100">
		<analyzer>
			<tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="max-word" dicPath="dic"/>
		</analyzer>
	</fieldType>

	<!-- mmseg4j tokenizer in simple mode, reading its dictionary from dicPath. -->
	<fieldType name="mmseg4j_simple" class="solr.TextField" positionIncrementGap="100">
		<analyzer>
			<!-- Disabled variant that points dicPath at an absolute directory:
			<tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="simple" dicPath="n:/OpenSource/apache-solr-1.3.0/example/solr/my_dic"/>
			-->
			<tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="simple" dicPath="dic"/>
		</analyzer>
	</fieldType>



	<!--
		Chinese field type backed by the smartcn analysis module that ships with Solr
		(the lucene-analyzers-smartcn jar from contrib/analysis-extras must be on the classpath).
		solr.HMMChineseTokenizerFactory performs the sentence and word segmentation in one step;
		it supersedes the SmartChineseSentenceTokenizerFactory / SmartChineseWordTokenFilterFactory
		pair, which is not available in Solr 5 and later.
	-->
	<fieldType name="text_smart" class="solr.TextField" positionIncrementGap="100">
		<analyzer type="index">
			<tokenizer class="solr.HMMChineseTokenizerFactory"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="solr.HMMChineseTokenizerFactory"/>
		</analyzer>
	</fieldType>





	<!-- General text type with synonym configuration: synonyms are applied at query time only. -->
	<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
		<analyzer type="index">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
			<!-- in this example, we will only use synonyms at query time
			<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
			-->
			<filter class="solr.LowerCaseFilterFactory"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
			<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
			<filter class="solr.LowerCaseFilterFactory"/>
		</analyzer>
	</fieldType>



	<!--
		Scoring (similarity) configuration.
		NOTE(review): com.example.solr.CustomSimilarityFactory looks like a placeholder class name;
		confirm that a real SimilarityFactory implementation with this name is on the classpath.
	-->
	<similarity class="com.example.solr.CustomSimilarityFactory">
		<str name="paramkey">param value</str>
	</similarity>

</schema>

其他选择默认配置,并将mmseg4j-analysis-1.9.1，mmseg4j-core-1.9.1，mmseg4j-solr-1.9.1 jar包拷贝到solr-6.3.0\server\solr-webapp\webapp\WEB-INF\lib目录。

异常报错1：

Caused by: org.apache.solr.common.SolrException: Multiple [schema.xml] fieldType registered to the same name: text_mmseg4j ignoring: text_mmseg4j{class=org.apache.solr.schema.TextField,analyzer=org.apache.solr.analysis.TokenizerChain,args=null}

原因是我在配置mmseg4j分词时，三个fieldType使用了相同的name；分别改为mmseg4j_simple、mmseg4j_complex、mmseg4j_max_word后恢复正常。

异常报错2：

java.lang.NoSuchMethodError: org.apache.solr.core.SolrResourceLoader.getInstanceDir()Ljava/lang/String;

	at com.chenlb.mmseg4j.solr.Utils.getDict(Utils.java:18)

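直接原因是mmseg4j的dicPath配置成了相对路径：从堆栈可以看出，mmseg4j-solr在Utils.getDict中调用了SolrResourceLoader.getInstanceDir()，而当前Solr版本中已没有这个返回String的方法，因此抛出NoSuchMethodError；将dicPath配置为绝对路径后恢复正常。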

0 个评论

要回复文章请先登录或注册