1. Create a /lib folder under SOLR_HOME/example/solr and add the following JARs from the SIREn targets:
siren-core-0.2.3-SNAPSHOT.jar
siren-qparser-0.2.3-SNAPSHOT.jar
siren-solr-0.2.3-SNAPSHOT.jar
2. Modify solrconfig.xml and add the following:
<!-- Registers the SIREn query parser plugin under the name "siren". -->
<queryParser name="siren" class="org.sindice.siren.solr.SirenQParserPlugin"/>
<!-- Request handler named "siren"; its defaults below select the "siren"
query parser through defType. -->
<requestHandler name="siren" class="solr.StandardRequestHandler">
<!-- Default values for query parameters -->
<lst name="defaults">
<str name="defType">siren</str>
<str name="echoParams">explicit</str>
<!-- Disable field queries in the keyword parser -->
<str name="disableField">true</str>
<!-- NOTE(review): qf, nqf and tqf look like boosted field lists (field^boost)
for the keyword, NTriple and tabular parsers respectively; confirm against
the SIREn documentation. -->
<str name="qf">
ntriple^1.0 url^1.2
</str>
<str name="nqf">
ntriple^1.0
</str>
<!-- The NTriple query multi-field operator:
- disjunction: the query should match in at least one of the fields
- scattered: each NTriple pattern should match in at least one of the fields
-->
<str name="nqfo">scattered</str>
<str name="tqf">
tabular^1.0
</str>
<!-- The Tabular query multi-field operator:
- disjunction: the query should match in at least one of the fields
- scattered: each tabular pattern should match in at least one of the fields
-->
<str name="tqfo">scattered</str>
<!-- Field list: only the id field is requested by default. -->
<str name="fl">
id
</str>
</lst>
</requestHandler>
3. Modify schema.xml and add the following; if fields named url and id already exist in the file, rename the existing ones first.
<!-- Document identifier (the document URL).
     Declared with the 'string' field type, so the value is not tokenised. -->
<field name="id"
       type="string"
       indexed="true"
       stored="true"
       required="false"/>

<!-- Document URL.
     Declared with the 'uri' field type so that the value is tokenised. -->
<field name="url"
       type="uri"
       indexed="true"
       stored="true"
       required="true"/>

<!-- Field using the n-triple indexing scheme. -->
<field name="ntriple"
       type="ntriple"
       indexed="true"
       stored="true"
       multiValued="false"/>

<!-- Field using the tabular indexing scheme (indexed but not stored). -->
<field name="tabular"
       type="tabular"
       indexed="true"
       stored="false"
       multiValued="false"/>
<!-- URI field type.
     Relies on WhitespaceTokenizer plus WordDelimiterFilter (index side) to
     break a URI into multiple components; stop words come from an external
     file.
     Norms are omitted (omitNorms="true"): the field is short and norms do not
     really make sense for URIs.
     ASCIIFoldingExpansionFilter is deliberately left out, since URIs should
     not contain accented characters.
-->
<fieldType name="uri"
           class="solr.TextField"
           omitNorms="true"
           positionIncrementGap="100">

  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Break words into subwords on delimiters:
           * subwords are also split on case change
           * preserveOriginal="1" keeps the original word as a token
         Splitting on numerics is switched off (fix for SND-355 and SND-1283).
    -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            catenateWords="0" catenateNumbers="0" catenateAll="0"
            splitOnCaseChange="1" splitOnNumerics="0"
            preserveOriginal="1"/>

    <!-- Keep only tokens whose length lies between min and max, inclusive. -->
    <filter class="solr.LengthFilterFactory" min="2" max="256"/>

    <!-- Lowercase every token. -->
    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- Case-insensitive stop word removal.
         enablePositionIncrements="true" is set in both the index and the
         query analyzer so that a 'gap' is left where a stop word was removed,
         giving more accurate phrase queries.
    -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
  </analyzer>

  <analyzer type="query">
    <!-- Whitespace tokenizer, so that a URI is not tokenised. -->
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Keep only tokens whose length lies between min and max, inclusive. -->
    <filter class="solr.LengthFilterFactory" min="2" max="256"/>

    <filter class="solr.LowerCaseFilterFactory"/>

    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>

    <!-- Replace QNames in URIs by their namespaces. -->
    <filter class="org.sindice.siren.solr.analysis.QNamesFilterFactory" qnames="qnames.txt"/>
  </analyzer>

</fieldType>
<!--
  SIREn field types.
  The top-level analyzers are defined in the file named by analyzerConfig
  (tuple-analyzers.xml here) and the datatype analyzers in the file named by
  datatypeConfig (tuple-datatypes.xml here).
  Field norms are not useful for SIREn fields: setting omitNorms to true
  reduces memory consumption and improves ranking.
  omitTermFreqAndPositions *must* be set to false.
-->
<fieldType name="ntriple"
           class="org.sindice.siren.solr.schema.SirenField"
           analyzerConfig="tuple-analyzers.xml"
           datatypeConfig="tuple-datatypes.xml"
           omitNorms="true"
           omitTermFreqAndPositions="false"/>

<fieldType name="tabular"
           class="org.sindice.siren.solr.schema.SirenField"
           analyzerConfig="tuple-analyzers.xml"
           datatypeConfig="tuple-datatypes.xml"
           omitNorms="true"
           omitTermFreqAndPositions="false"/>

<!-- Similarity implementation supplied by SIREn. -->
<similarity class="org.sindice.siren.similarity.SirenSimilarity"/>
4. Copy the following files from SIREN_HOME/siren_solr/example/solr/config to SOLR_HOME/example/solr/config:
tuple-analyzers.xml
tuple-datatypes.xml
qnames.txt
5. Restart the default Jetty in Solr by running java -jar start.jar
6. Test with the sample code in SIREN_HOME/siren_solr/example/
The examples are indexed successfully but the queries return no result. P.S. SIREn doesn't support SPARQL.
sources:
- https://github.com/rdelbru/SIREn/blob/master/siren-solr/example/INSTALL.txt
Thanks for sharing this, which is quite useful. However, I am a little puzzled by the last words "The examples are indexed successfully but the queries return no result. P.S. SIREn doesn't support SPARQL."
Sorry for my ignorance, but does it mean the queries are not working because of SPARQL? Thanks for the clarification.