finished basic functionality

This commit is contained in:
Daniel 2024-09-26 10:04:03 -05:00
parent b0e8bd5f76
commit d48dc97eda
4 changed files with 90 additions and 15 deletions

View File

@ -1,4 +1,7 @@
import pymongo import pymongo
import os import os
import json import json
import sys import sys
import pprint
# Shared handle to the `content` collection of the `paperless` database on the
# MongoDB server at localhost:27017; other modules reach it as `lib.db`.
db = pymongo.MongoClient("localhost", 27017).paperless.content

View File

@ -4,24 +4,52 @@ import lib
# Load the paperless export manifest once, at import time. The context manager
# closes the file as soon as the JSON has been parsed instead of leaving the
# handle open for the lifetime of the process.
with open("/mnt/user/media/paperless/export/manifest.json") as raw_manifest:
    manifest = lib.json.load(raw_manifest)

# export directory for created md files
directory = '/mnt/user/repos/docs/paperless_export'

# Index given to the next record that parse() inserts; parse() advances it
# after every successful insert.
index_num = 1

# Run counters: `existing` and `duplicates` are updated by exists(),
# `inserted` and `big` by parse().
existing, inserted, big, duplicates = 0, 0, 0, 0
def insert(r):
    """Store one parsed record in the shared collection (``lib.db``).

    ``r`` must provide the keys ``title``, ``content``, ``check`` and
    ``index``. The value under ``check`` is saved as the ``checksum`` field;
    the other three keep their names.
    """
    document = {
        "title": r["title"],
        "content": r["content"],
        "checksum": r["check"],
        "index": r["index"],
    }
    lib.db.insert_one(document)
def exists(r):
    """Report whether a record with the same title and content is stored.

    Two lookups are made against ``lib.db``: one by checksum (``r["check"]``)
    and one by the title/content pair. Only the title/content lookup decides
    the return value; the checksum lookup decides which counter is bumped:

    * title/content found and checksum found     -> ``existing`` += 1
    * title/content found, checksum not found    -> ``duplicates`` += 1

    Returns:
        True when a title/content match exists, otherwise False. A record
        whose checksum is stored but whose title/content pair is not is
        reported as not existing.
    """
    global existing, duplicates

    same_checksum = lib.db.find_one({"checksum": r["check"]})
    same_text = lib.db.find_one({"content": r["content"], "title": r["title"]})

    if not same_text:
        return False

    if same_checksum:
        existing += 1
    else:
        duplicates += 1
    return True
def parse():
    """Walk the export manifest and insert every new, non-empty document.

    For each manifest entry a record is built from ``fields.title``,
    ``fields.content`` and ``fields.checksum`` plus the next ``index_num``.
    Entries missing any of those keys are skipped. A record is inserted only
    when its content is below the size threshold, neither title nor content
    is the empty string, and exists() reports no stored match.

    Updates the module-level counters ``inserted`` and ``big`` and advances
    ``index_num`` after each insert (``existing``/``duplicates`` are updated
    inside exists()).
    """
    global inserted, big, index_num
    print("entering parse()")
    # for every document in the export
    for document in manifest:
        # Keep the try body limited to the manifest lookups so that a KeyError
        # raised further down (in exists()/insert()) is not silently swallowed.
        try:
            fields = document["fields"]
            record = {
                "title": fields["title"],
                "content": fields["content"],
                "check": fields["checksum"],
                "index": index_num,
            }
        except KeyError:
            # entry has no title/content/checksum -- nothing to store
            continue

        # 16777216 bytes = 16 MiB; presumably chosen to match MongoDB's
        # per-document size limit -- TODO confirm. Note that sys.getsizeof
        # reports the size of the Python object, not of the encoded document.
        # NOTE(review): the `big` counter is tied to this size check; the
        # original indentation was not available to confirm that pairing.
        if lib.sys.getsizeof(record["content"]) >= 16777216:
            big += 1
            continue

        # skip records whose title or content is blank
        if record["content"] == "" or record["title"] == "":
            continue

        if exists(record):
            continue

        insert(record)
        inserted += 1
        index_num += 1

View File

@ -1,9 +1,31 @@
import lib
import extract import extract
import search
import pprint
def prompt():
    """Ask the user for a search value on the console and return it as typed."""
    answer = input("Search Value: ")
    return answer
def menu():
    """Run the interactive search loop until the user enters ``quit``.

    Each value read via prompt() is passed to ``search.query``. A single
    document (a dict) has its ``content`` printed; any other result is
    iterated and each document is listed as ``index : title``.
    """
    value = prompt()
    while value != "quit":
        results = search.query(value)
        if results is None:
            # Treat "nothing came back" like an empty result list rather than
            # crashing on iteration.
            results = []
        # isinstance() also accepts dict subclasses, unlike an exact type
        # comparison.
        if isinstance(results, dict):
            print(results["content"])
        else:
            for doc in results:
                print(doc["index"], " : ", doc["title"])
        value = prompt()
def main():
    """Import the export, print the run counters, then start the search menu."""
    extract.parse()

    # Counters are read only after parse() has finished updating them.
    summary = (
        ("Existing:", extract.existing),
        ("Inserted:", extract.inserted),
        ("To big:", extract.big),
        ("Dupes:", extract.duplicates),
    )
    for label, count in summary:
        print(label, count)

    menu()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,22 @@
from pprint import pprint
import lib
def query(s):
    """Find documents by index number or by regex match on title/content.

    Args:
        s: Search string entered by the user. It is passed to MongoDB as a
            regular expression, unescaped.

    Returns:
        The single matching document (a dict) when ``s`` consists only of
        decimal digits and a document with that ``index`` exists. Otherwise a
        list (possibly empty) of every document whose ``content`` or
        ``title`` matches ``s``.
    """
    # Only an all-decimal string can name an index. isdecimal() is used
    # because isdigit() also accepts characters (e.g. superscripts) that
    # int() rejects.
    if s.isdecimal():
        by_index = lib.db.find_one({"index": int(s)})
        # Return the direct hit only if it really exists; a numeric search
        # term with no matching index falls through to the text search
        # instead of returning None.
        if by_index is not None:
            return by_index

    cursor = lib.db.find(
        {
            "$or": [
                {"content": {"$regex": s}},
                {"title": {"$regex": s}},
            ]
        }
    )
    return list(cursor)