extract working
This commit is contained in:
parent
1777ae8eca
commit
b0e8bd5f76
4
lib/__init__.py
Normal file
4
lib/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
import pymongo
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import sys
|
27
src/extract.py
Normal file
27
src/extract.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import lib
|
||||||
|
|
||||||
|
# import paperless database export
# Open the manifest in a context manager so the file handle is closed as soon
# as the JSON has been parsed instead of staying open for the process lifetime.
with open("/mnt/user/media/paperless/export/manifest.json", encoding="utf-8") as raw_manifest:
    manifest = lib.json.load(raw_manifest)

# Client for the MongoDB server on localhost:27017; `db` is its "paperless" database.
mongo = lib.pymongo.MongoClient("localhost", 27017)
db = mongo.paperless

# export directory for created md files
directory = '/mnt/user/repos/docs/paperless_export'
|
||||||
|
|
||||||
|
def output(t, c):
    """Insert one record into the ``content`` collection of ``db``.

    ``t`` is stored under the "title" key and ``c`` under the "content" key.
    """
    record = {"title": t, "content": c}
    db.content.insert_one(record)
|
||||||
|
|
||||||
|
def parse():
    """Send every usable record of the loaded manifest to ``output``.

    Iterates over the module-level ``manifest`` and, for each record, reads
    the "title" and "content" entries of its "fields" mapping. A record is
    skipped when either entry is missing or an empty string, or when
    ``getsizeof`` of the content is 999999 or more. Each accepted title is
    printed before the record is stored.
    """
    print("entering parse()")
    # for every document in the export
    for field in manifest:
        fields = field["fields"]
        # Look both values up fresh for each record. Assigning them only when
        # the key exists lets a record without the key either hit an unbound
        # local or silently reuse the value left over from the previous record.
        title = fields.get("title", "")
        content = fields.get("content", "")
        # skip records whose title or content is absent or blank
        if content == "" or title == "":
            continue
        # NOTE(review): getsizeof reports the in-memory size of the object,
        # not its encoded length -- confirm this is the intended limit.
        if lib.sys.getsizeof(content) >= 999999:
            continue
        print(title)
        output(title, content)
|
10
src/main.py
10
src/main.py
@ -1 +1,9 @@
|
|||||||
print("Hello world.")
|
import lib
|
||||||
|
import extract
|
||||||
|
|
||||||
|
def main():
    """Run the extraction step by delegating to ``extract.parse``."""
    extract.parse()
|
||||||
|
|
||||||
|
# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
0
src/search.py
Normal file
0
src/search.py
Normal file
Loading…
Reference in New Issue
Block a user