extract working

This commit is contained in:
Daniel 2024-09-26 04:52:13 -05:00
parent 1777ae8eca
commit b0e8bd5f76
4 changed files with 40 additions and 1 deletions

4
lib/__init__.py Normal file
View File

@ -0,0 +1,4 @@
import pymongo
import os
import json
import sys

27
src/extract.py Normal file
View File

@ -0,0 +1,27 @@
import lib
# import paperless database export
# The file is opened in a ``with`` block so the handle is closed as soon as
# the JSON has been parsed; ``raw_manifest`` stays bound (to the closed file)
# afterwards. JSON text is read as UTF-8 explicitly rather than relying on
# the platform's default encoding.
with open(
    "/mnt/user/media/paperless/export/manifest.json", encoding="utf-8"
) as raw_manifest:
    manifest = lib.json.load(raw_manifest)

# MongoDB client on localhost:27017; ``db`` is its ``paperless`` database.
mongo = lib.pymongo.MongoClient("localhost", 27017)
db = mongo.paperless

# export directory for created md files
directory = '/mnt/user/repos/docs/paperless_export'
def output(t, c):
    """Insert one title/content pair into the ``content`` collection of ``db``.

    Args:
        t: Value stored under the ``"title"`` key.
        c: Value stored under the ``"content"`` key.
    """
    record = {"title": t, "content": c}
    db.content.insert_one(record)
def parse():
    """Store every manifest entry that has a usable title and content.

    Walks the module-level ``manifest``. For each entry whose ``"fields"``
    mapping holds a non-empty ``"title"`` and a non-empty ``"content"``
    below the size cap, the title is printed and both values are passed
    to ``output()``.
    """
    print("entering parse()")
    # Exclusive upper bound on sys.getsizeof(content) for an entry to be stored.
    max_content_size = 999999
    # for every document in the export
    for field in manifest:
        fields = field["fields"]
        # Skip entries missing either key, so a value left over from an
        # earlier entry is never reused and nothing is read while unbound.
        if "title" not in fields or "content" not in fields:
            continue
        title = fields["title"]
        content = fields["content"]
        # if the title and content tags aren't blank
        if title == "" or content == "":
            continue
        # NOTE(review): getsizeof reports the in-memory size of the object,
        # not its encoded byte length — confirm this is the intended limit.
        if lib.sys.getsizeof(content) >= max_content_size:
            continue
        print(title)
        output(title, content)

View File

@ -1 +1,9 @@
print("Hello world.")
import lib
import extract
def main():
    """Entry point: run ``extract.parse()``."""
    run_extract = extract.parse
    run_extract()


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0
src/search.py Normal file
View File