0.2.0 initial commit

This commit is contained in:
Yusur 2025-10-08 14:46:09 +02:00
commit 6f67d125af
38 changed files with 2051 additions and 0 deletions

62
xefyl/search_engine.py Normal file
View file

@ -0,0 +1,62 @@
import requests
import subprocess
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit
import datetime
import os
import random
import re
import json
from sqlalchemy import func, select
from . import APP_BASE_DIR
# Search-engine settings, loaded once at import time from
# <APP_BASE_DIR>/config/searchengine.json.  The file is opened in a context
# manager so the handle is closed right after parsing, and decoded as UTF-8
# explicitly instead of relying on the platform's default locale encoding.
with open(
    os.path.join(APP_BASE_DIR, "config", "searchengine.json"),
    encoding="utf-8",
) as _config_file:
    CONFIG = json.load(_config_file)

# Filesystem path of the "wordenizer"; taken from the WORDENIZER_PATH
# environment variable, falling back to <APP_BASE_DIR>/wordenizer.
WORDENIZER_PATH = os.getenv('WORDENIZER_PATH', os.path.join(APP_BASE_DIR, 'wordenizer'))
# import models definitions
from .models import db, Page, Href, AnchorWord, Word
class Query:
    """A parsed search query that can be turned into a SQLAlchemy statement.

    Attributes:
        words: terms to search for; ``build`` creates one sub-select per term.
        exact_words: stored as given; only ``is_empty`` consults it.
        exclude: terms that were prefixed with ``-``; stored as given, but
            ``build`` does not apply them yet.
    """

    def __init__(self, words=(), exact_words=(), exclude=()):
        self.words = words
        self.exact_words = exact_words
        self.exclude = exclude

    @classmethod
    def from_string(cls, s):
        """Parse the raw query string *s* into a ``Query``.

        The string is lowercased and split into runs of word characters, each
        optionally preceded by a single ``-``.  Tokens starting with ``-`` go
        to ``exclude`` (with the dash removed); the remaining tokens go to
        ``words`` unless they are listed in ``CONFIG['stopwords']``.
        ``exact_words`` is left at its default.
        """
        words, exclude = [], []
        for token in re.findall(r"-?\w+", s.lower()):
            if token.startswith('-'):
                exclude.append(token.lstrip('-'))
            elif token not in CONFIG['stopwords']:
                words.append(token)
        return cls(words=words, exclude=exclude)

    def build(self, page=1):
        """Return the SQLAlchemy statement selecting pages for this query.

        For every entry of ``self.words`` a select over ``Page`` (joined
        through ``Href``, ``AnchorWord`` and ``Word``, grouped by page id) is
        created; the per-word selects are combined with ``intersect`` and the
        result is ordered by the summed ``AnchorWord.count``, descending.

        Args:
            page: accepted for interface compatibility; currently unused.

        Raises:
            ValueError: if ``self.words`` is empty.  There is nothing to
                select in that case (previously this surfaced as an opaque
                ``AttributeError`` on ``None``).
        """
        if not self.words:
            raise ValueError("cannot build a search statement: query has no words")

        statement = None
        for word in self.words:
            per_word = (
                select(Page)
                .join(Href, Href.page_id == Page.id)
                .join(AnchorWord, Href.id == AnchorWord.to_page_id)
                .join(Word)
                .where(Word.content == word)
                .group_by(Page.id)
            )
            statement = per_word if statement is None else statement.intersect(per_word)

        # NOTE(review): with more than one word the statement is a compound
        # INTERSECT; ordering it by an aggregate of AnchorWord.count may not be
        # accepted by every database backend -- confirm against the target DB.
        return statement.order_by(func.sum(AnchorWord.count).desc())

    def is_empty(self):
        """Return True when both ``words`` and ``exact_words`` are empty."""
        return not self.words and not self.exact_words
...