Documentation
¶
Overview ¶
Package classifier implements form, field, and page type classification.
Index ¶
- func ElemFeatures(elem *goquery.Selection, form *goquery.Selection) map[string]any
- func GetFormFeatures(form *goquery.Selection, formType string, fieldElems []*goquery.Selection) []map[string]any
- type ClassifyProbaResult
- type ClassifyResult
- type FeaturePipeline
- type FieldTypeModel
- type FormCSS
- type FormElements
- type FormFeatureExtractor
- type FormFieldClassifier
- func (c *FormFieldClassifier) Classify(form *goquery.Selection, fields bool) ClassifyResult
- func (c *FormFieldClassifier) ClassifyPage(doc *goquery.Document) string
- func (c *FormFieldClassifier) ClassifyPageProba(doc *goquery.Document, threshold float64) map[string]float64
- func (c *FormFieldClassifier) ClassifyProba(form *goquery.Selection, threshold float64, fields bool) ClassifyProbaResult
- func (c *FormFieldClassifier) ExtractForms(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
- func (c *FormFieldClassifier) ExtractFormsFromReader(r *strings.Reader, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
- func (c *FormFieldClassifier) ExtractPage(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, ClassifyResult, ClassifyProbaResult, error)
- func (c *FormFieldClassifier) SaveModel(path string) error
- type FormInputCSS
- type FormInputNames
- type FormInputTitle
- type FormLabelText
- type FormLinksText
- type FormResult
- type FormTypeModel
- type FormTypeSummaryExtractor
- type FormTypeTrainConfig
- type FormURL
- type PageBodyTextExtractor
- type PageCSSExtractor
- type PageFeatureExtractor
- type PageFeaturePipeline
- type PageH1Extractor
- type PageHeadingsExtractor
- type PageMetaDescriptionExtractor
- type PageNavTextExtractor
- type PageStructureExtractor
- type PageTitleExtractor
- type PageTypeModel
- type PageTypeTrainConfig
- type PageURLExtractor
- type SerializedPipeline
- type SubmitText
- type UnifiedModel
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ElemFeatures ¶
ElemFeatures extracts per-field features for CRF classification.
Types ¶
type ClassifyProbaResult ¶
type ClassifyProbaResult struct {
Form map[string]float64 `json:"form"`
Fields map[string]map[string]float64 `json:"fields,omitempty"`
}
ClassifyProbaResult holds probability-based classification results.
type ClassifyResult ¶
type ClassifyResult struct {
Form string `json:"form"`
Fields map[string]string `json:"fields,omitempty"`
}
ClassifyResult holds the classification result for a form.
type FeaturePipeline ¶
type FeaturePipeline struct {
Name string
Extractor FormFeatureExtractor
VecType string // "dict", "count", "tfidf"
NgramRange [2]int
MinDF int
Binary bool
Analyzer string
StopWords map[string]bool
UseEnglishStop bool
}
FeaturePipeline describes a feature extraction + vectorization pipeline.
func DefaultFeaturePipelines ¶
func DefaultFeaturePipelines() []FeaturePipeline
DefaultFeaturePipelines returns the 9 feature extraction pipelines matching Formasaurus's FEATURES list.
type FieldTypeModel ¶
FieldTypeModel wraps a CRF model for field type classification.
func TrainFieldType ¶
func TrainFieldType(sequences []crf.TrainingSequence, config crf.TrainerConfig) *FieldTypeModel
TrainFieldType trains a CRF model for field type classification.
func (*FieldTypeModel) ClassifyProba ¶
func (m *FieldTypeModel) ClassifyProba(form *goquery.Selection, formType string) map[string]map[string]float64
ClassifyProba returns field type probabilities for a form.
type FormElements ¶
type FormElements struct{}
FormElements extracts structural boolean features from a form.
func (FormElements) ExtractDict ¶
func (f FormElements) ExtractDict(form *goquery.Selection) map[string]any
func (FormElements) ExtractString ¶
func (f FormElements) ExtractString(_ *goquery.Selection) string
func (FormElements) IsDict ¶
func (f FormElements) IsDict() bool
type FormFeatureExtractor ¶
type FormFeatureExtractor interface {
ExtractString(form *goquery.Selection) string
ExtractDict(form *goquery.Selection) map[string]any
IsDict() bool
}
FormFeatureExtractor extracts features from a form element.
type FormFieldClassifier ¶
type FormFieldClassifier struct {
FormModel *FormTypeModel
FieldModel *FieldTypeModel
PageModel *PageTypeModel
}
FormFieldClassifier detects HTML form, field, and page types.
func LoadClassifier ¶
func LoadClassifier(path string) (*FormFieldClassifier, error)
LoadClassifier loads a FormFieldClassifier from disk.
func (*FormFieldClassifier) Classify ¶
func (c *FormFieldClassifier) Classify(form *goquery.Selection, fields bool) ClassifyResult
Classify returns the form type and field types.
func (*FormFieldClassifier) ClassifyPage ¶ added in v0.0.3
func (c *FormFieldClassifier) ClassifyPage(doc *goquery.Document) string
ClassifyPage classifies the page type using form results as features.
func (*FormFieldClassifier) ClassifyPageProba ¶ added in v0.0.3
func (c *FormFieldClassifier) ClassifyPageProba(doc *goquery.Document, threshold float64) map[string]float64
ClassifyPageProba returns page type probabilities.
func (*FormFieldClassifier) ClassifyProba ¶
func (c *FormFieldClassifier) ClassifyProba(form *goquery.Selection, threshold float64, fields bool) ClassifyProbaResult
ClassifyProba returns probabilities for form and field types.
func (*FormFieldClassifier) ExtractForms ¶
func (c *FormFieldClassifier) ExtractForms(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
ExtractForms extracts and classifies all forms from HTML.
func (*FormFieldClassifier) ExtractFormsFromReader ¶
func (c *FormFieldClassifier) ExtractFormsFromReader(r *strings.Reader, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
ExtractFormsFromReader extracts and classifies forms from a *strings.Reader.
func (*FormFieldClassifier) ExtractPage ¶ added in v0.0.3
func (c *FormFieldClassifier) ExtractPage(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, ClassifyResult, ClassifyProbaResult, error)
ExtractPage classifies both the page type and forms from HTML.
func (*FormFieldClassifier) SaveModel ¶
func (c *FormFieldClassifier) SaveModel(path string) error
SaveModel saves the classifier to disk.
type FormInputCSS ¶
type FormInputCSS struct{}
FormInputCSS extracts CSS of non-hidden inputs.
func (FormInputCSS) ExtractDict ¶
func (f FormInputCSS) ExtractDict(form *goquery.Selection) map[string]any
func (FormInputCSS) ExtractString ¶
func (f FormInputCSS) ExtractString(form *goquery.Selection) string
func (FormInputCSS) IsDict ¶
func (f FormInputCSS) IsDict() bool
type FormInputNames ¶
type FormInputNames struct{}
FormInputNames extracts names of non-hidden inputs.
func (FormInputNames) ExtractDict ¶
func (f FormInputNames) ExtractDict(form *goquery.Selection) map[string]any
func (FormInputNames) ExtractString ¶
func (f FormInputNames) ExtractString(form *goquery.Selection) string
func (FormInputNames) IsDict ¶
func (f FormInputNames) IsDict() bool
type FormInputTitle ¶
type FormInputTitle struct{}
FormInputTitle extracts title attributes of non-hidden inputs.
func (FormInputTitle) ExtractDict ¶
func (f FormInputTitle) ExtractDict(form *goquery.Selection) map[string]any
func (FormInputTitle) ExtractString ¶
func (f FormInputTitle) ExtractString(form *goquery.Selection) string
func (FormInputTitle) IsDict ¶
func (f FormInputTitle) IsDict() bool
type FormLabelText ¶
type FormLabelText struct{}
FormLabelText extracts label text inside the form.
func (FormLabelText) ExtractDict ¶
func (f FormLabelText) ExtractDict(form *goquery.Selection) map[string]any
func (FormLabelText) ExtractString ¶
func (f FormLabelText) ExtractString(form *goquery.Selection) string
func (FormLabelText) IsDict ¶
func (f FormLabelText) IsDict() bool
type FormLinksText ¶
type FormLinksText struct{}
FormLinksText extracts link text inside the form.
func (FormLinksText) ExtractDict ¶
func (f FormLinksText) ExtractDict(_ *goquery.Selection) map[string]any
func (FormLinksText) ExtractString ¶
func (f FormLinksText) ExtractString(form *goquery.Selection) string
func (FormLinksText) IsDict ¶
func (f FormLinksText) IsDict() bool
type FormResult ¶
type FormResult struct {
FormHTML string `json:"form_html"`
Result ClassifyResult `json:"result,omitempty"`
Proba ClassifyProbaResult `json:"proba,omitempty"`
}
FormResult holds the result for a single form.
type FormTypeModel ¶
type FormTypeModel struct {
Classes []string `json:"classes"`
Coef [][]float64 `json:"coef"` // [numClasses][numFeatures]
Intercept []float64 `json:"intercept"` // [numClasses]
Pipelines []SerializedPipeline `json:"pipelines"`
// contains filtered or unexported fields
}
FormTypeModel holds a trained form type classifier.
func TrainFormType ¶
func TrainFormType(forms []*goquery.Selection, labels []string, config FormTypeTrainConfig) *FormTypeModel
TrainFormType trains a form type classifier.
func (*FormTypeModel) Classify ¶
func (m *FormTypeModel) Classify(form *goquery.Selection) string
Classify returns the predicted form type.
func (*FormTypeModel) ClassifyProba ¶
func (m *FormTypeModel) ClassifyProba(form *goquery.Selection) map[string]float64
ClassifyProba returns probabilities for each form type.
func (*FormTypeModel) InitRuntime ¶
func (m *FormTypeModel) InitRuntime()
InitRuntime initializes runtime state from serialized pipelines.
type FormTypeSummaryExtractor ¶ added in v0.0.3
type FormTypeSummaryExtractor struct{}
FormTypeSummaryExtractor extracts features from form classification results.
func (FormTypeSummaryExtractor) ExtractDict ¶ added in v0.0.3
func (e FormTypeSummaryExtractor) ExtractDict(_ *goquery.Document, formResults []ClassifyResult) map[string]any
func (FormTypeSummaryExtractor) ExtractString ¶ added in v0.0.3
func (e FormTypeSummaryExtractor) ExtractString(_ *goquery.Document, _ []ClassifyResult) string
func (FormTypeSummaryExtractor) IsDict ¶ added in v0.0.3
func (e FormTypeSummaryExtractor) IsDict() bool
type FormTypeTrainConfig ¶
FormTypeTrainConfig holds training configuration.
func DefaultFormTypeTrainConfig ¶
func DefaultFormTypeTrainConfig() FormTypeTrainConfig
DefaultFormTypeTrainConfig returns default training config.
type PageBodyTextExtractor ¶ added in v0.0.3
type PageBodyTextExtractor struct{}
PageBodyTextExtractor extracts visible body text (first 2000 chars).
func (PageBodyTextExtractor) ExtractDict ¶ added in v0.0.3
func (e PageBodyTextExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageBodyTextExtractor) ExtractString ¶ added in v0.0.3
func (e PageBodyTextExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageBodyTextExtractor) IsDict ¶ added in v0.0.3
func (e PageBodyTextExtractor) IsDict() bool
type PageCSSExtractor ¶ added in v0.0.3
type PageCSSExtractor struct{}
PageCSSExtractor extracts body/main class and id attributes.
func (PageCSSExtractor) ExtractDict ¶ added in v0.0.3
func (e PageCSSExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageCSSExtractor) ExtractString ¶ added in v0.0.3
func (e PageCSSExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageCSSExtractor) IsDict ¶ added in v0.0.3
func (e PageCSSExtractor) IsDict() bool
type PageFeatureExtractor ¶ added in v0.0.3
type PageFeatureExtractor interface {
ExtractString(doc *goquery.Document, formResults []ClassifyResult) string
ExtractDict(doc *goquery.Document, formResults []ClassifyResult) map[string]any
IsDict() bool
}
PageFeatureExtractor extracts features from a page document.
type PageFeaturePipeline ¶ added in v0.0.3
type PageFeaturePipeline struct {
Name string
Extractor PageFeatureExtractor
VecType string // "dict", "tfidf"
NgramRange [2]int
MinDF int
Binary bool
Analyzer string
StopWords map[string]bool
UseEnglishStop bool
}
PageFeaturePipeline describes a page feature extraction + vectorization pipeline.
func DefaultPageFeaturePipelines ¶ added in v0.0.3
func DefaultPageFeaturePipelines() []PageFeaturePipeline
DefaultPageFeaturePipelines returns the 9 page feature extraction pipelines.
type PageH1Extractor ¶ added in v0.0.3
type PageH1Extractor struct{}
PageH1Extractor extracts <h1> text.
func (PageH1Extractor) ExtractDict ¶ added in v0.0.3
func (e PageH1Extractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageH1Extractor) ExtractString ¶ added in v0.0.3
func (e PageH1Extractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageH1Extractor) IsDict ¶ added in v0.0.3
func (e PageH1Extractor) IsDict() bool
type PageHeadingsExtractor ¶ added in v0.0.3
type PageHeadingsExtractor struct{}
PageHeadingsExtractor extracts all h1-h6 text concatenated.
func (PageHeadingsExtractor) ExtractDict ¶ added in v0.0.3
func (e PageHeadingsExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageHeadingsExtractor) ExtractString ¶ added in v0.0.3
func (e PageHeadingsExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageHeadingsExtractor) IsDict ¶ added in v0.0.3
func (e PageHeadingsExtractor) IsDict() bool
type PageMetaDescriptionExtractor ¶ added in v0.0.3
type PageMetaDescriptionExtractor struct{}
PageMetaDescriptionExtractor extracts <meta name="description"> content.
func (PageMetaDescriptionExtractor) ExtractDict ¶ added in v0.0.3
func (e PageMetaDescriptionExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageMetaDescriptionExtractor) ExtractString ¶ added in v0.0.3
func (e PageMetaDescriptionExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageMetaDescriptionExtractor) IsDict ¶ added in v0.0.3
func (e PageMetaDescriptionExtractor) IsDict() bool
type PageNavTextExtractor ¶ added in v0.0.3
type PageNavTextExtractor struct{}
PageNavTextExtractor extracts <nav> text.
func (PageNavTextExtractor) ExtractDict ¶ added in v0.0.3
func (e PageNavTextExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageNavTextExtractor) ExtractString ¶ added in v0.0.3
func (e PageNavTextExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageNavTextExtractor) IsDict ¶ added in v0.0.3
func (e PageNavTextExtractor) IsDict() bool
type PageStructureExtractor ¶ added in v0.0.3
type PageStructureExtractor struct{}
PageStructureExtractor extracts structural features and error indicators.
func (PageStructureExtractor) ExtractDict ¶ added in v0.0.3
func (e PageStructureExtractor) ExtractDict(doc *goquery.Document, _ []ClassifyResult) map[string]any
func (PageStructureExtractor) ExtractString ¶ added in v0.0.3
func (e PageStructureExtractor) ExtractString(_ *goquery.Document, _ []ClassifyResult) string
func (PageStructureExtractor) IsDict ¶ added in v0.0.3
func (e PageStructureExtractor) IsDict() bool
type PageTitleExtractor ¶ added in v0.0.3
type PageTitleExtractor struct{}
PageTitleExtractor extracts <title> text.
func (PageTitleExtractor) ExtractDict ¶ added in v0.0.3
func (e PageTitleExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageTitleExtractor) ExtractString ¶ added in v0.0.3
func (e PageTitleExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string
func (PageTitleExtractor) IsDict ¶ added in v0.0.3
func (e PageTitleExtractor) IsDict() bool
type PageTypeModel ¶ added in v0.0.3
type PageTypeModel struct {
Classes []string `json:"classes"`
Coef [][]float64 `json:"coef"`
Intercept []float64 `json:"intercept"`
Pipelines []SerializedPipeline `json:"pipelines"`
// contains filtered or unexported fields
}
PageTypeModel holds a trained page type classifier.
func TrainPageType ¶ added in v0.0.3
func TrainPageType(docs []*goquery.Document, formResults [][]ClassifyResult, urls []string, labels []string, config PageTypeTrainConfig) *PageTypeModel
TrainPageType trains a page type classifier.
func (*PageTypeModel) Classify ¶ added in v0.0.3
func (m *PageTypeModel) Classify(doc *goquery.Document, formResults []ClassifyResult) string
Classify returns the predicted page type.
func (*PageTypeModel) ClassifyProba ¶ added in v0.0.3
func (m *PageTypeModel) ClassifyProba(doc *goquery.Document, formResults []ClassifyResult) map[string]float64
ClassifyProba returns probabilities for each page type.
func (*PageTypeModel) InitRuntime ¶ added in v0.0.3
func (m *PageTypeModel) InitRuntime()
InitRuntime initializes runtime state from serialized pipelines.
type PageTypeTrainConfig ¶ added in v0.0.3
type PageTypeTrainConfig struct {
C float64
MaxIter int
Verbose bool
BalanceClass bool // use balanced class weights
}
PageTypeTrainConfig holds training configuration for the page type model.
func DefaultPageTypeTrainConfig ¶ added in v0.0.3
func DefaultPageTypeTrainConfig() PageTypeTrainConfig
DefaultPageTypeTrainConfig returns default training config.
type PageURLExtractor ¶ added in v0.0.3
type PageURLExtractor struct {
URL string // set per-document before extraction
}
PageURLExtractor extracts URL path patterns.
func (PageURLExtractor) ExtractDict ¶ added in v0.0.3
func (e PageURLExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any
func (PageURLExtractor) ExtractString ¶ added in v0.0.3
func (e PageURLExtractor) ExtractString(_ *goquery.Document, _ []ClassifyResult) string
func (PageURLExtractor) IsDict ¶ added in v0.0.3
func (e PageURLExtractor) IsDict() bool
type SerializedPipeline ¶
type SerializedPipeline struct {
Name string `json:"name"`
ExtractorType string `json:"extractor_type"`
VecType string `json:"vec_type"`
DictVec *vectorizer.DictVectorizer `json:"dict_vec,omitempty"`
CountVec *vectorizer.CountVectorizer `json:"count_vec,omitempty"`
TfidfVec *vectorizer.TfidfVectorizer `json:"tfidf_vec,omitempty"`
}
SerializedPipeline holds the serialized state of a feature pipeline.
type SubmitText ¶
type SubmitText struct{}
SubmitText extracts submit button text.
func (SubmitText) ExtractDict ¶
func (f SubmitText) ExtractDict(_ *goquery.Selection) map[string]any
func (SubmitText) ExtractString ¶
func (f SubmitText) ExtractString(form *goquery.Selection) string
func (SubmitText) IsDict ¶
func (f SubmitText) IsDict() bool
type UnifiedModel ¶
type UnifiedModel struct {
FormModel *FormTypeModel `json:"form_model"`
FieldModel *crf.Model `json:"field_model"`
PageModel *PageTypeModel `json:"page_model"`
}
UnifiedModel holds form, field, and page models for serialization.