import datetime
import json
import os
import random
import re
import subprocess
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from sqlalchemy import func, select

from . import APP_BASE_DIR

# Load the search-engine configuration once at import time.
with open(os.path.join(APP_BASE_DIR, "config", "searchengine.json")) as f:
    CONFIG = json.load(f)
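# A minimal sketch of the expected searchengine.json shape (an assumption
# inferred from the CONFIG['stopwords'] lookup in Query.from_string below):
#   {"stopwords": ["the", "a", "an", "and", "of"]}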

WORDENIZER_PATH = os.getenv("WORDENIZER_PATH", os.path.join(APP_BASE_DIR, "wordenizer"))

# import model definitions
from .models import db, Page, Href, AnchorWord, Word


class Query(object):
    """A parsed search query: words to match, exact words, and words to exclude."""

    def __init__(self, words=(), exact_words=(), exclude=()):
        self.words = words
        self.exact_words = exact_words
        self.exclude = exclude

    @classmethod
    def from_string(cls, s):
        """Parse a raw query string; a leading '-' marks a word to exclude."""
        lowered = s.lower()
        tokens = re.findall(r"-?\w+", lowered)
        words, exclude = [], []
        for w in tokens:
            if w.startswith('-'):
                exclude.append(w.lstrip('-'))
            elif w not in CONFIG['stopwords']:
                words.append(w)
        return cls(words=words, exclude=exclude)
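
    # Example (a sketch, not from the original source): for the input
    # "python -snake tutorial", from_string yields words == ['python', 'tutorial']
    # (minus any configured stopwords) and exclude == ['snake'].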

    def build(self, page=1):
        """Build a SELECT of pages matching every query word, ranked by
        anchor-word count. Assumes self.words is non-empty (callers can
        check is_empty() first)."""
        # TODO: exclusions are collected but not applied yet, e.g.
        # if self.exclude:
        #     wq &= (Word.content != w)
        q = None
        for w in self.words:
            q1 = (select(Page)
                  .join(Href, Href.page_id == Page.id)
                  .join(AnchorWord, Href.id == AnchorWord.to_page_id)
                  .join(Word)
                  .where(Word.content == w)
                  .group_by(Page.id))
            if q is None:
                q = q1
            else:
                # Intersect per-word selects so only pages matching all words remain.
                q = q.intersect(q1)
        # Rank results by total anchor-word frequency.
        q = q.order_by(func.sum(AnchorWord.count).desc())
        return q

    def is_empty(self):
        return not self.words and not self.exact_words
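

# Usage sketch (an assumption about the surrounding app, not part of this file:
# db.session comes from .models and follows the usual SQLAlchemy 2.x pattern):
#
#     query = Query.from_string("python search engine -java")
#     if not query.is_empty():
#         pages = db.session.execute(query.build()).scalars().all()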

...