Version: 5.5.0
Windows 10
Jdk 1.8
PUT /index
{
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"tokenizer": "ngram_tokenizer"
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 6000
}
}
}
},
"mappings": {
"bigdata": {
"dynamic_templates": [
{
"textMapping_no_analyzer_sore": {
"match":"*_no_analyzer_sore",
"mapping": {
"type": "keyword",
"store": "true",
"analyzer": "not_analyzed"
}
}
},
{
"textMapping_analyzer_sore": {
"match":"*_analyzer_sore",
"mapping": {
"type": "text",
"store": "true",
"analyzer": "ngram_analyzer"
}
}
}
我使用es自带 ngram分词,建立数据的时候就报如下错误
[2017-07-22T15:04:01,829][WARN ][o.e.i.c.IndicesClusterStateService] [5GI70mm] [[index][3]] marking and sending shard failed due to [shard failure, reason [already closed by tragic event on the index writer]]
java.lang.ArrayIndexOutOfBoundsException: -65536
at org.apache.lucene.index.TermsHashPerField.writeByte(TermsHashPerField.java:196) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.TermsHashPerField.writeVInt(TermsHashPerField.java:219) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.FreqProxTermsWriterPerField.writeProx(FreqProxTermsWriterPerField.java:80) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.FreqProxTermsWriterPerField.newTerm(FreqProxTermsWriterPerField.java:120) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.TermsHashPerField.add(TermsHashPerField.java:176) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:447) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:403) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:232) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:478) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1571) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1316) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:661) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:605) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:505) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:556) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:545) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:484) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:143) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:113) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:69) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:939) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:908) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:113) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse(TransportReplicationAction.java:322) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse(TransportReplicationAction.java:264) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse(TransportReplicationAction.java:888) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse(TransportReplicationAction.java:885) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShardOperationsLock.acquire(IndexShardOperationsLock.java:147) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationLock(IndexShard.java:1657) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference(TransportReplicationAction.java:897) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction.access$400(TransportReplicationAction.java:93) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun(TransportReplicationAction.java:281) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:260) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:252) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:69) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:644) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:638) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-5.5.0.jar:5.5.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [?:1.8.0_111]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [?:1.8.0_111]
at java.lang.Thread.run(Thread.java:745) [?:1.8.0_111]
Windows 10
Jdk 1.8
PUT /index
{
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"tokenizer": "ngram_tokenizer"
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 6000
}
}
}
},
"mappings": {
"bigdata": {
"dynamic_templates": [
{
"textMapping_no_analyzer_sore": {
"match":"*_no_analyzer_sore",
"mapping": {
"type": "keyword",
"store": "true",
"analyzer": "not_analyzed"
}
}
},
{
"textMapping_analyzer_sore": {
"match":"*_analyzer_sore",
"mapping": {
"type": "text",
"store": "true",
"analyzer": "ngram_analyzer"
}
}
}
我使用es自带 ngram分词,建立数据的时候就报如下错误
[2017-07-22T15:04:01,829][WARN ][o.e.i.c.IndicesClusterStateService] [5GI70mm] [[index][3]] marking and sending shard failed due to [shard failure, reason [already closed by tragic event on the index writer]]
java.lang.ArrayIndexOutOfBoundsException: -65536
at org.apache.lucene.index.TermsHashPerField.writeByte(TermsHashPerField.java:196) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.TermsHashPerField.writeVInt(TermsHashPerField.java:219) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.FreqProxTermsWriterPerField.writeProx(FreqProxTermsWriterPerField.java:80) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.FreqProxTermsWriterPerField.newTerm(FreqProxTermsWriterPerField.java:120) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.TermsHashPerField.add(TermsHashPerField.java:176) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:447) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:403) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:232) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:478) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1571) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1316) ~[lucene-core-6.6.0.jar:6.6.0 5c7a7b65d2aa7ce5ec96458315c661a18b320241 - ishan - 2017-05-30 07:29:46]
at org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:661) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:605) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:505) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:556) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:545) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:484) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:143) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:113) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:69) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:939) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:908) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:113) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse(TransportReplicationAction.java:322) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse(TransportReplicationAction.java:264) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse(TransportReplicationAction.java:888) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse(TransportReplicationAction.java:885) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShardOperationsLock.acquire(IndexShardOperationsLock.java:147) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationLock(IndexShard.java:1657) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference(TransportReplicationAction.java:897) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction.access$400(TransportReplicationAction.java:93) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun(TransportReplicationAction.java:281) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:260) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:252) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:69) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:644) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:638) ~[elasticsearch-5.5.0.jar:5.5.0]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-5.5.0.jar:5.5.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [?:1.8.0_111]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [?:1.8.0_111]
at java.lang.Thread.run(Thread.java:745) [?:1.8.0_111]
4 个回复
kennywu76 - Wood
赞同来自: nainc
我查了下资料,Lucene对于一个生成的token的大小上限是32k,然后分析文本的时候,用于存放token的内存缓冲区大小最大是2048MB, 因此缓冲区最多能缓存65536个token。 如果你输入的文本超级大,max_gram也超级大,那么分析出来的uniq token会超过缓冲区大小,从而抛出上面的错误。
一方面ngram需要对min -> max gram的范围做限制,一般就是几个。 另外一方面,ngram一般不用于对大段的文本做分析,他主要的用途是分析邮政编码,商品编码一类的数据。 大段文本应该使用其他分词器。
hubiao
赞同来自:
如果这种不可取,那我想要
文本:北京天安门
分词效果:
北
北京
北京天
北京天安
北京天安门
京
京天
京天安
京天安门
天
天安
天安门
安
安门
门
这样的分词效果,还有其他类似于 ngram,用于处理大文本的分词器吗?
kennywu76 - Wood
赞同来自:
整段的大文本不适合这样处理,可以考虑使用ik一类的中文分词器。 但是此时搜索的时候是无法达到类似数据库里的like %xyz%这样的效果的,并且对于长文本,搜索应用通常也不是用like %%这样的方式检索,需要转变一下思维方式。
建议仔细阅读一下: https://www.elastic.co/guide/c ... .html
hubiao
赞同来自: