xefyl/xefyl/search_engine.py

62 lines
1.7 KiB
Python

import requests
import subprocess
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit
import datetime
import os
import random
import re
import json
from sqlalchemy import func, select
from . import APP_BASE_DIR
# Search-engine settings, read once at import time. Query.from_string reads
# the "stopwords" entry. The file is opened in a context manager so the
# handle is closed as soon as parsing finishes, and decoded as UTF-8 (the
# standard JSON encoding) rather than the platform's locale default.
with open(os.path.join(APP_BASE_DIR, "config", "searchengine.json"),
          encoding="utf-8") as _config_file:
    CONFIG = json.load(_config_file)
# Path for the wordenizer: taken from the WORDENIZER_PATH environment
# variable when set, otherwise "<APP_BASE_DIR>/wordenizer".
WORDENIZER_PATH = os.getenv('WORDENIZER_PATH', os.path.join(APP_BASE_DIR, 'wordenizer'))
# import model definitions
from .models import db, Page, Href, AnchorWord, Word
class Query(object):
    """A parsed search query that can be turned into a SQLAlchemy statement.

    The three term collections are stored exactly as passed in:

    * ``words`` -- terms that every result must match (see ``build``).
    * ``exact_words`` -- stored and consulted by ``is_empty``; ``build``
      does not use it.
    * ``exclude`` -- terms written with a leading '-' in the query string;
      ``build`` does not apply them.
    """

    def __init__(self, words=(), exact_words=(), exclude=()):
        self.words = words
        self.exact_words = exact_words
        self.exclude = exclude

    @classmethod
    def from_string(cls, s):
        """Build a Query from a raw search string.

        The string is lower-cased and split into runs of word characters.
        A token directly preceded by '-' is added to ``exclude`` with the
        dash removed; this also applies inside hyphenated input, so
        "foo-bar" yields the word "foo" and excludes "bar". Every other
        token is added to ``words`` unless it appears in
        ``CONFIG['stopwords']``. Excluded tokens are not stopword-filtered.
        ``exact_words`` is left at its default.
        """
        words, exclude = [], []
        for token in re.findall(r"-?\w+", s.lower()):
            if token.startswith('-'):
                # The pattern allows at most one leading '-'.
                exclude.append(token.lstrip('-'))
            elif token not in CONFIG['stopwords']:
                words.append(token)
        return cls(words=words, exclude=exclude)

    def build(self, page=1):
        """Return a statement selecting the pages that match every word.

        One ``select(Page)`` is built per entry of ``self.words`` (joined
        through Href, AnchorWord and Word, filtered on ``Word.content`` and
        grouped by ``Page.id``). The per-word selects are combined with
        INTERSECT and ordered by the summed ``AnchorWord.count``, highest
        first.

        ``page`` is accepted but currently unused, and neither
        ``self.exclude`` nor ``self.exact_words`` is applied.

        Raises:
            ValueError: if ``self.words`` yields no terms. Previously this
                surfaced as an AttributeError on ``None``.
        """
        # TODO: apply self.exclude (e.g. filter out pages matching those words).
        q = None
        for w in self.words:
            # NOTE(review): Href.id is joined to AnchorWord.to_page_id, and
            # the Word join relies on an inferred relationship -- confirm
            # both against the model definitions.
            q1 = (select(Page)
                  .join(Href, Href.page_id == Page.id)
                  .join(AnchorWord, Href.id == AnchorWord.to_page_id)
                  .join(Word).where(Word.content == w).group_by(Page.id))
            q = q1 if q is None else q.intersect(q1)
        if q is None:
            raise ValueError("cannot build a search query without any words")
        # NOTE(review): with more than one word this orders an INTERSECT by
        # an aggregate -- confirm the emitted SQL is valid on the target DB.
        return q.order_by(func.sum(AnchorWord.count).desc())

    def is_empty(self):
        """Return True when both ``words`` and ``exact_words`` are empty.

        ``exclude`` is not considered, so a query made only of exclusions
        counts as empty.
        """
        return not self.words and not self.exact_words
...