public class HTMLNoiseReducer
extends java.lang.Object
プロパティ名 | 説明 | デフォルト値/必須 |
---|---|---|
rcss.html.noise.reducer.min.score | 類似と判定される最小スコア | DEF_MIN_SCORE |
rcss.html.noise.reducer.removable.tags | カンマ区切りで列挙された削除単位HTMLタグ | DEF_TAGS |
FileIOHTMLNoiseReducer
修飾子とタイプ | クラスと説明 |
---|---|
static class | HTMLNoiseReducer.BlockVector HTML文書を構成するブロックのベクトル表現クラス。 |
修飾子とタイプ | フィールドと説明 |
---|---|
static float | DEF_MIN_SCORE |
static java.lang.String | DEF_TAGS |
static java.lang.String | P_MIN_SCORE |
static java.lang.String | P_PREFIX |
static java.lang.String | P_TAGS |
protected java.util.Properties | props |
コンストラクタと説明 |
---|
HTMLNoiseReducer(java.util.Properties props) |
修飾子とタイプ | メソッドと説明 |
---|---|
static void |
calculateTextVector(HTMLNoiseReducer.BlockVector blockVector,
java.lang.String value) |
static void |
calculateVector(HTMLNoiseReducer.BlockVector blockVector,
org.w3c.dom.Node block) |
void |
checkSimilarities(HTMLNoiseReducer.BlockVector originBlockVector,
java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> dest) |
static double |
cosine(HTMLNoiseReducer.BlockVector v1,
HTMLNoiseReducer.BlockVector v2) |
void |
createBlockVectors(java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> resultTable,
org.w3c.dom.Node node,
int pageId) |
org.w3c.dom.Node |
getDocument(org.xml.sax.InputSource is) |
org.w3c.dom.Node |
getDocument(java.lang.String strHtml) |
java.util.Set<java.lang.String> |
getRemovableTags(java.lang.String strTags)
カンマ区切りで記述されたHTMLタグ一覧を大文字に正規化した上で Set に変換して返す。 |
static double |
getSumSquareRoot(java.util.Map<java.lang.String,java.lang.Integer> v) |
static HTMLNoiseReducer.BlockVector |
getVector(org.w3c.dom.Node parentBlock,
int pageId) |
boolean |
isDescendantAllNoisy(org.w3c.dom.Node node,
java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> nbvMap) |
boolean |
isRemovableBlockElement(org.w3c.dom.Node node) |
static void |
outputHtml(java.io.PrintWriter pw,
java.lang.String indent,
org.w3c.dom.Node html) |
org.w3c.dom.Node[] |
reduceNoise(java.io.File[] htmls,
int start,
int length,
java.lang.String encoding) |
org.w3c.dom.Node[] |
reduceNoise(org.xml.sax.InputSource... htmls) |
org.w3c.dom.Node[] |
reduceNoise(org.w3c.dom.Node[] doms) |
org.w3c.dom.Node[] |
reduceNoise(java.lang.String[] htmls,
int start,
int length) |
void |
removeNoisyNodes(org.w3c.dom.Node node,
java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> nbvMap) |
org.w3c.dom.Node |
removeNoisyNodes(org.w3c.dom.Node parent,
org.w3c.dom.Node node,
java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> nbvMap) |
protected void |
setProperties() |
public static final java.lang.String P_PREFIX
public static final java.lang.String P_MIN_SCORE
public static final java.lang.String P_TAGS
public static final float DEF_MIN_SCORE
public static final java.lang.String DEF_TAGS
protected java.util.Properties props
public java.util.Set<java.lang.String> getRemovableTags(java.lang.String strTags)
カンマ区切りで記述されたHTMLタグ一覧を大文字に正規化した上で Set に変換して返す。
パラメータ:
strTags - カンマ区切り文字列のHTMLタグ一覧
protected void setProperties()
public org.w3c.dom.Node[] reduceNoise(java.lang.String[] htmls, int start, int length) throws org.xml.sax.SAXException, java.io.IOException
org.xml.sax.SAXException
java.io.IOException
public org.w3c.dom.Node[] reduceNoise(java.io.File[] htmls, int start, int length, java.lang.String encoding) throws org.xml.sax.SAXException, java.io.IOException
org.xml.sax.SAXException
java.io.IOException
public org.w3c.dom.Node[] reduceNoise(org.xml.sax.InputSource... htmls) throws org.xml.sax.SAXException, java.io.IOException
org.xml.sax.SAXException
java.io.IOException
public org.w3c.dom.Node[] reduceNoise(org.w3c.dom.Node[] doms) throws org.xml.sax.SAXException, java.io.IOException
org.xml.sax.SAXException
java.io.IOException
public org.w3c.dom.Node getDocument(java.lang.String strHtml) throws org.xml.sax.SAXException, java.io.IOException
org.xml.sax.SAXException
java.io.IOException
public org.w3c.dom.Node getDocument(org.xml.sax.InputSource is) throws org.xml.sax.SAXException, java.io.IOException
org.xml.sax.SAXException
java.io.IOException
public void createBlockVectors(java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> resultTable, org.w3c.dom.Node node, int pageId)
public boolean isRemovableBlockElement(org.w3c.dom.Node node)
public static HTMLNoiseReducer.BlockVector getVector(org.w3c.dom.Node parentBlock, int pageId)
public static void calculateVector(HTMLNoiseReducer.BlockVector blockVector, org.w3c.dom.Node block)
public static void calculateTextVector(HTMLNoiseReducer.BlockVector blockVector, java.lang.String value)
public void checkSimilarities(HTMLNoiseReducer.BlockVector originBlockVector, java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> dest)
public static double cosine(HTMLNoiseReducer.BlockVector v1, HTMLNoiseReducer.BlockVector v2)
public static double getSumSquareRoot(java.util.Map<java.lang.String,java.lang.Integer> v)
public void removeNoisyNodes(org.w3c.dom.Node node, java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> nbvMap)
public org.w3c.dom.Node removeNoisyNodes(org.w3c.dom.Node parent, org.w3c.dom.Node node, java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> nbvMap)
public boolean isDescendantAllNoisy(org.w3c.dom.Node node, java.util.Map<org.w3c.dom.Node,HTMLNoiseReducer.BlockVector> nbvMap)
public static void outputHtml(java.io.PrintWriter pw, java.lang.String indent, org.w3c.dom.Node html)
Copyright © 2009-2018 RONDHUIT Co.,Ltd. All Rights Reserved.