Stripped personal data from development repository
Samo Penic
2019-02-20 83c3f647c35477564b77cbc5b36d37d793d5442a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import django
 
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "aoi.settings")
django.setup()
from exam import models as exmodel
from scan import models as scmodel
from aoi_ocr.Ocr import Paper as ocr_paper
from sklearn.externals import joblib
from django.core.files import File
import json
import pkg_resources
from glob import glob
import pathlib
import time
import collections
import shutil
 
# Directory that scans are moved into once they have been stored.
processed_dir = "processedscans"

# Options passed to the OCR engine for every scanned page.
settings = dict(sid_mask="11x00xxx", answer_threshold=0.25)

# Resolve the classifier file shipped inside the ``aoi_ocr`` package and load
# it once at start-up; the same object is reused for every scan.
path = "/filename.joblib"  # always use slash
filepath = pkg_resources.resource_filename("aoi_ocr", path)
classifier = joblib.load(filepath)
 
# Poll the "inputscans" directory forever.  Every *.tif found there is run
# through the OCR engine, stored as a ``Scan`` row linked to its exam and
# generated paper, and then moved into ``processed_dir``.
while True:
    # List the directory once per pass; glob() returns [] when nothing matches.
    filelist = sorted(glob("inputscans/*.tif"))
    if not filelist:
        # Nothing to process yet: wait before polling again.
        time.sleep(10)
        continue

    wrong_sid = 0  # pages whose OCR result carried at least one error
    total = 0  # first pages seen in this pass (page_no == 1)
    for f in filelist:
        print("processing: {}".format(f))
        p = ocr_paper(
            filename=f, sid_classifier=classifier, settings=settings
        ).get_paper_ocr_data()
        if p is None:
            # NOTE(review): the file stays in inputscans, so it is OCR'd
            # again on every following pass -- confirm this is intended.
            print("We got an empty page... Continuing.")
            continue
        if p["page_no"] == 1:
            total += 1
        if p["errors"]:
            wrong_sid += 1
        if total % 10 == 0:
            print("Total:{}, wrong SID: {}".format(total, wrong_sid))
        print(p["exam_id"], p["paper_id"], p["page_no"])

        ex = exmodel.Exam.objects.get(pk=int(p["exam_id"]))
        pa = exmodel.GeneratedPaper.objects.get(
            serial_no=int(p["paper_id"]), exam=ex
        )
        # exists() asks the database for a yes/no answer instead of loading
        # every matching row just to count them.
        already_stored = scmodel.Scan.objects.filter(
            exam=ex, paper=pa, page_no=p["page_no"]
        ).exists()
        if already_stored:
            # NOTE(review): as with empty pages, the source file is not moved
            # and will be seen again on the next pass.
            print("paper already exists in the scan list")
            continue

        sc = scmodel.Scan()
        sc.answer_matrix = p["ans_matrix"]
        sc.student_id = p["sid"]
        sc.ocr_debug = json.dumps(p)
        sc.exam = ex
        sc.paper = pa
        sc.page_no = p["page_no"]
        # Close the image file as soon as it has been handed to the storage
        # backend; this loop never exits, so an unclosed handle per scan
        # would accumulate.
        with open(p["output_filename"], "rb") as image_fh:
            sc.scan_image.save(
                p["output_filename"].split("/")[-1], File(image_fh)
            )
        sc.save()

        # Archive the source under a timestamped name.
        plib_src = pathlib.Path(f)
        plib_dest = pathlib.Path(processed_dir).joinpath(
            plib_src.stem + "__" + str(int(time.time())) + plib_src.suffix
        )
        if not plib_dest.exists():
            # shutil.move() is used rather than Path.replace() because the
            # input and processed directories may live on different
            # filesystems (e.g. separate docker volumes), where a plain
            # rename can fail; shutil.move() falls back to copy + delete.
            shutil.move(str(plib_src), str(plib_dest))
    print("Total:{}, wrong SID: {}".format(total, wrong_sid))