Retrieve recognised text from JSON returned by Microsoft OCR











up vote
3
down vote

favorite












The Microsoft OCR API returns json, and if I want to extact the text data from this json:



response = 
{
"language": "en",
"textAngle": -2.0000000000000338,
"orientation": "Up",
"regions": [
{
"boundingBox": "462,379,497,258",
"lines": [
{
"boundingBox": "462,379,497,74",
"words": [
{
"boundingBox": "462,379,41,73",
"text": "A"
},
{
"boundingBox": "523,379,153,73",
"text": "GOAL"
},
{
"boundingBox": "694,379,265,74",
"text": "WITHOUT"
}
]
},
{
"boundingBox": "565,471,289,74",
"words": [
{
"boundingBox": "565,471,41,73",
"text": "A"
},
{
"boundingBox": "626,471,150,73",
"text": "PLAN"
},
{
"boundingBox": "801,472,53,73",
"text": "IS"
}
]
},
{
"boundingBox": "519,563,375,74",
"words": [
{
"boundingBox": "519,563,149,74",
"text": "JUST"
},
{
"boundingBox": "683,564,41,72",
"text": "A"
},
{
"boundingBox": "741,564,153,73",
"text": "WISH"
}
]
}
]
}
]
}




def check_for_word(ocr):
# Initialise our subject to None
print("OCR: {}".format(ocr))
subject = None
for region in ocr["regions"]:
if "lines" in region:
for lines in region["lines"]:
if "words" in lines:
for word in lines["words"]:
if "text" in word:
subject = word["text"].lower()
break

print("OCR word is {}".format(subject))
return subject

print(response["regions"][0]["lines"][0]["words"][0]["text"]) # Should return this

print(check_for_word(response))



  • Each dictionary has arrays and we are unsure if the array contains any element

  • Also not sure if the dictionary has key


Let's say we just wish to return the first text it matched from the image file.



This code works but it has a deep nested structure that has bad smell. Is there a better practice to write this in a cleaner way?










share|improve this question









New contributor




zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
























    up vote
    3
    down vote

    favorite












    The Microsoft OCR API returns json, and if I want to extact the text data from this json:



    response = 
    {
    "language": "en",
    "textAngle": -2.0000000000000338,
    "orientation": "Up",
    "regions": [
    {
    "boundingBox": "462,379,497,258",
    "lines": [
    {
    "boundingBox": "462,379,497,74",
    "words": [
    {
    "boundingBox": "462,379,41,73",
    "text": "A"
    },
    {
    "boundingBox": "523,379,153,73",
    "text": "GOAL"
    },
    {
    "boundingBox": "694,379,265,74",
    "text": "WITHOUT"
    }
    ]
    },
    {
    "boundingBox": "565,471,289,74",
    "words": [
    {
    "boundingBox": "565,471,41,73",
    "text": "A"
    },
    {
    "boundingBox": "626,471,150,73",
    "text": "PLAN"
    },
    {
    "boundingBox": "801,472,53,73",
    "text": "IS"
    }
    ]
    },
    {
    "boundingBox": "519,563,375,74",
    "words": [
    {
    "boundingBox": "519,563,149,74",
    "text": "JUST"
    },
    {
    "boundingBox": "683,564,41,72",
    "text": "A"
    },
    {
    "boundingBox": "741,564,153,73",
    "text": "WISH"
    }
    ]
    }
    ]
    }
    ]
    }




    def check_for_word(ocr):
    # Initialise our subject to None
    print("OCR: {}".format(ocr))
    subject = None
    for region in ocr["regions"]:
    if "lines" in region:
    for lines in region["lines"]:
    if "words" in lines:
    for word in lines["words"]:
    if "text" in word:
    subject = word["text"].lower()
    break

    print("OCR word is {}".format(subject))
    return subject

    print(response["regions"][0]["lines"][0]["words"][0]["text"]) # Should return this

    print(check_for_word(response))



    • Each dictionary has arrays and we are unsure if the array contains any element

    • Also not sure if the dictionary has key


    Let's say we just wish to return the first text it matched from the image file.



    This code works but it has a deep nested structure that has bad smell. Is there a better practice to write this in a cleaner way?










    share|improve this question









    New contributor




    zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.






















      up vote
      3
      down vote

      favorite









      up vote
      3
      down vote

      favorite











      The Microsoft OCR API returns json, and if I want to extact the text data from this json:



      response = 
      {
      "language": "en",
      "textAngle": -2.0000000000000338,
      "orientation": "Up",
      "regions": [
      {
      "boundingBox": "462,379,497,258",
      "lines": [
      {
      "boundingBox": "462,379,497,74",
      "words": [
      {
      "boundingBox": "462,379,41,73",
      "text": "A"
      },
      {
      "boundingBox": "523,379,153,73",
      "text": "GOAL"
      },
      {
      "boundingBox": "694,379,265,74",
      "text": "WITHOUT"
      }
      ]
      },
      {
      "boundingBox": "565,471,289,74",
      "words": [
      {
      "boundingBox": "565,471,41,73",
      "text": "A"
      },
      {
      "boundingBox": "626,471,150,73",
      "text": "PLAN"
      },
      {
      "boundingBox": "801,472,53,73",
      "text": "IS"
      }
      ]
      },
      {
      "boundingBox": "519,563,375,74",
      "words": [
      {
      "boundingBox": "519,563,149,74",
      "text": "JUST"
      },
      {
      "boundingBox": "683,564,41,72",
      "text": "A"
      },
      {
      "boundingBox": "741,564,153,73",
      "text": "WISH"
      }
      ]
      }
      ]
      }
      ]
      }




      def check_for_word(ocr):
      # Initialise our subject to None
      print("OCR: {}".format(ocr))
      subject = None
      for region in ocr["regions"]:
      if "lines" in region:
      for lines in region["lines"]:
      if "words" in lines:
      for word in lines["words"]:
      if "text" in word:
      subject = word["text"].lower()
      break

      print("OCR word is {}".format(subject))
      return subject

      print(response["regions"][0]["lines"][0]["words"][0]["text"]) # Should return this

      print(check_for_word(response))



      • Each dictionary has arrays and we are unsure if the array contains any element

      • Also not sure if the dictionary has key


      Let's say we just wish to return the first text it matched from the image file.



      This code works but it has a deep nested structure that has bad smell. Is there a better practice to write this in a cleaner way?










      share|improve this question









      New contributor




      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.











      The Microsoft OCR API returns json, and if I want to extact the text data from this json:



      response = 
      {
      "language": "en",
      "textAngle": -2.0000000000000338,
      "orientation": "Up",
      "regions": [
      {
      "boundingBox": "462,379,497,258",
      "lines": [
      {
      "boundingBox": "462,379,497,74",
      "words": [
      {
      "boundingBox": "462,379,41,73",
      "text": "A"
      },
      {
      "boundingBox": "523,379,153,73",
      "text": "GOAL"
      },
      {
      "boundingBox": "694,379,265,74",
      "text": "WITHOUT"
      }
      ]
      },
      {
      "boundingBox": "565,471,289,74",
      "words": [
      {
      "boundingBox": "565,471,41,73",
      "text": "A"
      },
      {
      "boundingBox": "626,471,150,73",
      "text": "PLAN"
      },
      {
      "boundingBox": "801,472,53,73",
      "text": "IS"
      }
      ]
      },
      {
      "boundingBox": "519,563,375,74",
      "words": [
      {
      "boundingBox": "519,563,149,74",
      "text": "JUST"
      },
      {
      "boundingBox": "683,564,41,72",
      "text": "A"
      },
      {
      "boundingBox": "741,564,153,73",
      "text": "WISH"
      }
      ]
      }
      ]
      }
      ]
      }




      def check_for_word(ocr):
      # Initialise our subject to None
      print("OCR: {}".format(ocr))
      subject = None
      for region in ocr["regions"]:
      if "lines" in region:
      for lines in region["lines"]:
      if "words" in lines:
      for word in lines["words"]:
      if "text" in word:
      subject = word["text"].lower()
      break

      print("OCR word is {}".format(subject))
      return subject

      print(response["regions"][0]["lines"][0]["words"][0]["text"]) # Should return this

      print(check_for_word(response))



      • Each dictionary has arrays and we are unsure if the array contains any element

      • Also not sure if the dictionary has key


      Let's say we just wish to return the first text it matched from the image file.



      This code works but it has a deep nested structure that has bad smell. Is there a better practice to write this in a cleaner way?







      python python-3.x






      share|improve this question









      New contributor




      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.











      share|improve this question









      New contributor




      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      share|improve this question




      share|improve this question








      edited Nov 15 at 17:50









      Toby Speight

      21.9k536108




      21.9k536108






      New contributor




      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      asked Nov 15 at 16:19









      zcahfg2

      1184




      1184




      New contributor




      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.





      New contributor





      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






      zcahfg2 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






















          1 Answer
          1






          active

          oldest

          votes

















          up vote
          3
          down vote



          accepted










          One way to almost halve the number of lines (and levels of indentation) needed is to use dict.get with as the optional default option:



          def check_for_word(ocr):
          for region in ocr["regions"]:
          for lines in region.get("lines", ):
          for word in lines.get("words", ):
          if "text" in word:
          return word["text"].lower()
          else:
          raise KeyError("OCR word not found")


          I would also move the printing outside the function, so you can immediately return and add a else clause to catch the case that it is not present (this part could also be done outside with your code by checking for None).






          share|improve this answer























            Your Answer





            StackExchange.ifUsing("editor", function () {
            return StackExchange.using("mathjaxEditing", function () {
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            });
            });
            }, "mathjax-editing");

            StackExchange.ifUsing("editor", function () {
            StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
            });
            });
            }, "code-snippets");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "196"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });






            zcahfg2 is a new contributor. Be nice, and check out our Code of Conduct.










             

            draft saved


            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f207733%2fretrieve-recognised-text-from-json-returned-by-microsoft-ocr%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            3
            down vote



            accepted










            One way to almost halve the number of lines (and levels of indentation) needed is to use dict.get with as the optional default option:



            def check_for_word(ocr):
            for region in ocr["regions"]:
            for lines in region.get("lines", ):
            for word in lines.get("words", ):
            if "text" in word:
            return word["text"].lower()
            else:
            raise KeyError("OCR word not found")


            I would also move the printing outside the function, so you can immediately return and add a else clause to catch the case that it is not present (this part could also be done outside with your code by checking for None).






            share|improve this answer



























              up vote
              3
              down vote



              accepted










              One way to almost halve the number of lines (and levels of indentation) needed is to use dict.get with as the optional default option:



              def check_for_word(ocr):
              for region in ocr["regions"]:
              for lines in region.get("lines", ):
              for word in lines.get("words", ):
              if "text" in word:
              return word["text"].lower()
              else:
              raise KeyError("OCR word not found")


              I would also move the printing outside the function, so you can immediately return and add a else clause to catch the case that it is not present (this part could also be done outside with your code by checking for None).






              share|improve this answer

























                up vote
                3
                down vote



                accepted







                up vote
                3
                down vote



                accepted






                One way to almost halve the number of lines (and levels of indentation) needed is to use dict.get with as the optional default option:



                def check_for_word(ocr):
                for region in ocr["regions"]:
                for lines in region.get("lines", ):
                for word in lines.get("words", ):
                if "text" in word:
                return word["text"].lower()
                else:
                raise KeyError("OCR word not found")


                I would also move the printing outside the function, so you can immediately return and add a else clause to catch the case that it is not present (this part could also be done outside with your code by checking for None).






                share|improve this answer














                One way to almost halve the number of lines (and levels of indentation) needed is to use dict.get with as the optional default option:



                def check_for_word(ocr):
                for region in ocr["regions"]:
                for lines in region.get("lines", ):
                for word in lines.get("words", ):
                if "text" in word:
                return word["text"].lower()
                else:
                raise KeyError("OCR word not found")


                I would also move the printing outside the function, so you can immediately return and add a else clause to catch the case that it is not present (this part could also be done outside with your code by checking for None).







                share|improve this answer














                share|improve this answer



                share|improve this answer








                edited Nov 15 at 19:07

























                answered Nov 15 at 17:15









                Graipher

                22k53183




                22k53183






















                    zcahfg2 is a new contributor. Be nice, and check out our Code of Conduct.










                     

                    draft saved


                    draft discarded


















                    zcahfg2 is a new contributor. Be nice, and check out our Code of Conduct.













                    zcahfg2 is a new contributor. Be nice, and check out our Code of Conduct.












                    zcahfg2 is a new contributor. Be nice, and check out our Code of Conduct.















                     


                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f207733%2fretrieve-recognised-text-from-json-returned-by-microsoft-ocr%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    Morgemoulin

                    Scott Moir

                    Souastre