0.2.0 initial commit
This commit is contained in:
commit
6f67d125af
38 changed files with 2051 additions and 0 deletions
62
xefyl/search_engine.py
Normal file
62
xefyl/search_engine.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
|
||||
import requests
|
||||
import subprocess
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlsplit
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import json
|
||||
|
||||
from sqlalchemy import func, select
|
||||
from . import APP_BASE_DIR
|
||||
|
||||
# Search-engine settings, read once at import time. Query.from_string() reads
# the "stopwords" entry from this mapping.
# The file is opened in a context manager so the handle is closed right after
# parsing instead of being left to the garbage collector.
with open(
    os.path.join(APP_BASE_DIR, "config", "searchengine.json"),
    encoding="utf-8",
) as _config_file:
    CONFIG = json.load(_config_file)

# Path to "wordenizer": taken from the WORDENIZER_PATH environment variable,
# falling back to a "wordenizer" entry inside the application base directory.
WORDENIZER_PATH = os.getenv('WORDENIZER_PATH', os.path.join(APP_BASE_DIR, 'wordenizer'))
|
||||
|
||||
# import models definitions
|
||||
|
||||
from .models import db, Page, Href, AnchorWord, Word
|
||||
|
||||
|
||||
|
||||
class Query(object):
    """A parsed search query.

    Attributes:
        words: words the query should match.
        exact_words: words stored separately from ``words``; only
            ``is_empty()`` reads them in this class.
        exclude: words that were prefixed with ``-`` in the query string.
    """

    def __init__(self, words=(), exact_words=(), exclude=()):
        self.words = words
        self.exact_words = exact_words
        self.exclude = exclude

    @classmethod
    def from_string(cls, s):
        """Parse a raw query string into a ``Query``.

        The string is lowercased and split into ``\\w+`` tokens with an
        optional leading ``-``. Tokens starting with ``-`` go to ``exclude``
        (without the dash); the others go to ``words`` unless they are listed
        in ``CONFIG['stopwords']``. ``exact_words`` is left at its default.
        """
        # NOTE(review): a hyphen inside a word also counts as an exclusion
        # marker here, so "foo-bar" yields words=["foo"], exclude=["bar"].
        # Confirm this is intended before changing the pattern.
        tokens = re.findall(r"-?\w+", s.lower())
        words, exclude = [], []
        for token in tokens:
            if token.startswith('-'):
                exclude.append(token.lstrip('-'))
            elif token not in CONFIG['stopwords']:
                words.append(token)
        return cls(words=words, exclude=exclude)

    def build(self, page=1):
        """Build the SQLAlchemy statement selecting pages that match the query.

        One ``select(Page)`` is built per word (joined through ``Href``,
        ``AnchorWord`` and ``Word``, filtered on ``Word.content`` and grouped
        by ``Page.id``); the per-word statements are combined with
        ``intersect`` and the result is ordered by the summed
        ``AnchorWord.count``, descending.

        Args:
            page: accepted for interface compatibility; currently unused.

        Returns:
            The combined, ordered statement.

        Raises:
            ValueError: if ``self.words`` is empty, since there is nothing to
                build a statement from.
        """
        # TODO: self.exclude and self.exact_words are not applied to the
        # statement yet.
        q = None
        for w in self.words:
            q1 = (select(Page)
                  .join(Href, Href.page_id == Page.id)
                  .join(AnchorWord, Href.id == AnchorWord.to_page_id)
                  .join(Word).where(Word.content == w).group_by(Page.id))
            q = q1 if q is None else q.intersect(q1)
        if q is None:
            # Previously this fell through to ``None.order_by(...)`` and
            # failed with an opaque AttributeError.
            raise ValueError("cannot build a search query without any words")
        q = q.order_by(func.sum(AnchorWord.count).desc())
        return q

    def is_empty(self):
        """Return True when the query has neither words nor exact words."""
        return not self.words and not self.exact_words
|
||||
|
||||
...
|
||||
Loading…
Add table
Add a link
Reference in a new issue