extract working
This commit is contained in:
parent
1777ae8eca
commit
b0e8bd5f76
4
lib/__init__.py
Normal file
4
lib/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
import pymongo
|
||||
import os
|
||||
import json
|
||||
import sys
|
27
src/extract.py
Normal file
27
src/extract.py
Normal file
@ -0,0 +1,27 @@
|
||||
import lib
|
||||
|
||||
# import paperless database export
# The handle is closed as soon as the JSON is parsed instead of staying open
# for the life of the process; JSON text is read as UTF-8 rather than
# whatever the platform default encoding happens to be.
with open(
    "/mnt/user/media/paperless/export/manifest.json", encoding="utf-8"
) as raw_manifest:
    manifest = lib.json.load(raw_manifest)

# MongoDB connection; `db` is the "paperless" database used by output()
mongo = lib.pymongo.MongoClient("localhost", 27017)
db = mongo.paperless

# export directory for created md files
directory = '/mnt/user/repos/docs/paperless_export'
|
||||
|
||||
def output(t, c):
    """Insert one document into the ``content`` collection of ``db``.

    t: value stored under the "title" key.
    c: value stored under the "content" key.
    """
    record = {"title": t, "content": c}
    db.content.insert_one(record)
|
||||
|
||||
def parse():
    """Push every usable entry of the loaded manifest through ``output``.

    Walks the module-level ``manifest``. For each entry whose ``fields``
    mapping contains both a "title" and a "content" key, with neither equal
    to the empty string and the content below the size cap, the title is
    printed and both values are passed to ``output``. Entries missing either
    key are skipped.
    """
    print("entering parse()")
    # for every document in the export
    for entry in manifest:
        fields = entry["fields"]

        # Both values are looked up fresh for each entry. An entry without
        # one of the keys is skipped outright, so it can neither hit an
        # unbound name nor pick up the title/content of an earlier entry.
        if "title" not in fields or "content" not in fields:
            continue
        title = fields["title"]
        content = fields["content"]

        # if the title and content tags aren't blank
        if content == "" or title == "":
            continue

        # NOTE(review): sys.getsizeof reports the size of the in-memory
        # object, not the encoded length -- presumably a rough guard against
        # oversized database documents; confirm the intended limit.
        if lib.sys.getsizeof(content) >= 999999:
            continue

        print(title)
        output(title, content)
|
10
src/main.py
10
src/main.py
@ -1 +1,9 @@
|
||||
print("Hello world.")
|
||||
import lib
|
||||
import extract
|
||||
|
||||
def main():
    """Entry point: run the extraction step from the ``extract`` module."""
    extract.parse()


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
0
src/search.py
Normal file
0
src/search.py
Normal file
Loading…
Reference in New Issue
Block a user